Here, I analyse and document my progress with the analysis of a world-wide whole genome sampling of Zymoseptoria tritici. Some of the analysis are just exploratory while some other are lined up in a clear path to answering questions related to the history of Z.tritici and to better understand its adaptation to various climates.

library(knitr)
library(reticulate)

#Data wrangling and data viz
library(tidyverse)
library(purrr)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
library(ggrepel)
library(ggtext)
library(scatterpie)
library(pheatmap)
library(ggridges)


library(ggtree)
library(tidytree)
library(multcomp)
library(lsmeans)

#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)

#Variables
world <- map_data("world")
project_dir="~/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "~/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"

#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "~/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"

#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
  depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
  depth_per_gene_dir = paste0(VAR_dir, "2_Depth_per_gene/")
  vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
  mito_SV = paste0(VAR_dir, "6_Mito_SV/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
  nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
  mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
   RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
   DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
  gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")


#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
effectors_annotation_file = "~/Documents/Postdoc_Eva/Manuscripts/Accepted/Alice_Cecile_Comparative_genomics/Data_for_publication/Annotations_2018_genomes_for_publication.tab"
eggnog_annotation = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.eggnog")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
metadata_name = "Main_table_from_SQL_Feb_2020"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))

Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)
Sys.setenv(TERIP=TE_RIP_dir)

Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)

Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)

#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)


# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
  read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
  mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
  mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
  mutate(Filter = "Mutants_etc"))

##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")

## Metadata of genotyped samples 
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()

Zt_meta = read_delim(paste0(metadata_dir, metadata_name, "_with_collection.tab"), 
                 col_names = temp, delim = "\t",
                 na = "\\N", guess_max = 2000) %>%
  unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
  inner_join(., genotyped_samples)  %>%
  mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
  mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
  mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2)) %>%
  dplyr::select(ID_file, Continent, Country, Latitude, Longitude, Latitude2, Longitude2,
                Sampling_Date, Collection)

temp = read_tsv(paste0(PopStr_dir, vcf_name, ".high_anc_coef_snmf.tsv")) %>%
  dplyr::select(ID_file = Sample, Cluster)

Zt_meta = full_join(Zt_meta, temp)

#genotyped_samples %>%
#  filter(!(ID_file %in% filtered_samples$ID_file)) %>%
#    write_tsv(Zt_list, col_names = F)



#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)

myColors2 <- c("#129eba", "#3F6E0C", "#DA4167", "#ffba0a", "#129eba", "#129eba", 
               "#8fa253", "#A20106", "#A20106", "#3F6E0C", "#8fa253")
names(myColors2) = levels(factor(Zt_meta$Cluster))
Color_Cluster = ggplot2::scale_colour_manual(name = "Cluster", values = myColors2)
Fill_Cluster = ggplot2::scale_fill_manual(name = "Cluster", values = myColors2)

## For clustering
K_colors = c("#f9c74f", "#f9844a", "#90be6d", "#f5cac3", 
"#83c5be", "#f28482", "#577590", "#e5e5e5", "#a09abc",  "#52796f",
"#219ebc", "#003049", "#82c0cc", "#283618", "white")

## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)

#Random gradients
greens=c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues=c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
#Run on the cluster

#Create bed file with 10kb windows
#(including the last window which can be smaller)
while read chr length temp temp2 temp3; do 
  start=0; 
  while [ "$start" -le "$length" ] ; do 
    if [ "$(($start + 10000))" -le  "$length" ] ; 
    then 
      echo -e "${chr}\t${start}\t$(($start + 10000))" ; 
    else  echo -e "${chr}\t${start}\t$length" ;  
    fi ; 
    start=`expr $start + 10000` ; 
  done ; 
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa.fai > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed


#GC etc from bedtools nuc
~/Software/bedtools nuc \
    -fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
    -bed /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
    > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab


#RIP per window 
#(my script makes a summary for all seq in a multifasta, so I tricked it my giving it a single window at a time)
 #columns are "CHROM", "Start", "End", "GC", "Product index", "Substrate index", "Composite"
 while read chr start end ; 
    do 
    echo -e "${chr}\t${start}\t${end}" > temp.bed ; 
    ~/Software/bedtools getfasta -fi /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa \
       -bed temp.bed -fo temp.fa ; 
    python GC_RIP_per_read_fastq.py --input_format fasta --out temp temp.fa ; 
    values=$(cat temp.txt | ~/Software/datamash-1.3/datamash transpose | grep "Median" | cut -f 2,3,4,5) ; 
    echo -e "${chr}\t${start}\t${end}\t$values" \
    >> /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv ; 
done < /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed


#Count variants
#columns are CHROM, Start(IN 10KB UNITS), Variant_number
zcat /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.vcf.gz  | \
   grep -v "#"  | awk 'BEGIN {FS="\t"; OFS="\t"} {print $1,$2,int($2/10000)}' | \
   ~/Software/bedtools groupby -g 1,3 -o count -c 2 > \
   /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab
   
   
#Gene coverage
#columns are CHROM, Start, End, Nb_overlapping_genes, Coverage_bp_gene, Window_length, Coverage_frac_gene
~/Software/bedtools coverage \
   -a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
   -b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3 \
   > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab
 
 #Reference TE coverage
#columns are CHROM, Start, End, Nb_overlapping_TEs, Coverage_bp_TE, Window_length, Coverage_frac_TE
 ~/Software/bedtools coverage \
    -a /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
    -b /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.Badet_Oggenfuss_2019.TE.gtf \
    > /data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab
# Uploading commands from the cluster to my computer.

# TE and RIP
rsync -avP \
  alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads* \
  ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
# Nb_reads_per_TE.txt
# Nb_reads.txt
rsync -avP \
  alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt \
  ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/ \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/
  
#Per windows: coordinates, RIP, GC, SNP counts
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.bed
 
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.RIP.tsv 
 
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.nuc_GC.tab
 
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.10kb_windows.SNP_counts.tab  \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/4_Joint_calling/
 
rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.gene_coverage.tab \
~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/

rsync -avP   alice@130.125.25.244:/data2/alice/WW_project/0_Data/Zymoseptoria_tritici.MG2.dna.toplevel.mt+.10kb_windows.TE_coverage.tab \
 ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/

Previously, based on the study of TE and RIP in fully-assembled Z.tritici genomes, a hypothesis was drawn. The lower RIP in TEs of European samples, as compared to Iranian isolates, could indicate a loss of RIP in Z.tritici when it spread out of its area of origin. Here, I would like to investigate this possibility in the different pop.

Transposable elements: distribution, population structure, and genomic architecture

The data plotted here are based on the following steps:

  • Detect the TE insertions using ngs_TE_mapper version 2.
  • Map the reads on TE consensus (from Lorrain et al. 2020, so it includes only Iranian and European samples) and create two bins: reads mapping on TEs and reads that are not mapping.
  • Measuring the RIP composite index for reads that mapped on TE.
  • Estimating the index median in TE reads per isolate.

TE content estimation: method comparison

First, I mapped the reads on the TE consensus created by Ursula based on Thomas’s pangenome. I also detected reference andnon-reference TE insertions with ngs_te_mapper2. I visualize the results here in terms of percentage of reads mapping on these consensus and of number of insertions. I try here to see if there are biases between collections, as well as to compare the different TE content estimations that I have used.

#Reading in the data
TE_qty = read_delim(paste0(RIP_DIR, "Nb_reads.txt"), delim = " ") %>%
  dplyr::filter(Total_reads > Te_aligned_reads) %>%
  dplyr::mutate(Percent_TE_Reads = Te_aligned_reads * 100 / Total_reads) %>%
  full_join(., Zt_meta) %>%
  unite(Continent, Country, ID_file, col = "for_display", remove = F)

world_avg <-
  TE_qty  %>%
  dplyr::summarize(avg = mean(as.numeric(Percent_TE_Reads), na.rm = T)) %>%
  pull(avg)
#Summarize numbers
#_________________
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" -v | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Ref_TE_numbers.txt
#grep "Number" /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*/ngs_te_mapper.log | grep "non-reference" | cut -d ":" -f 1,7 | sed 's|/| |g' | cut -d " " -f 7,9 | sort > /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/Non-ref_TE_numbers.txt
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*_TE_numbers.txt ../4_TE_RIP/



#Gather per strain files into one big file
#_________________________________________
#cd /data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.nonref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.nonref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.nonref.bed ; fi  ; done > Non-ref_all_strains.bed
#for direc in $dir_list ; do if [ -f ./${direc}${direc%/}_1_paired.ref.bed ] ; then awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}_1_paired.ref.bed ; else awk -v var=${direc%/} 'BEGIN {OFS = "\t"} {print var, $1,$2,$3,$4,$5}' ./${direc}${direc%/}.ref.bed ; fi  ; done > Ref_all_strains.bed
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/4_ngs_TE_mapper/*ef_all_strains.bed ../4_TE_RIP/
TE_qty = full_join(read_delim(paste0(TE_RIP_dir, "Non-ref_TE_numbers.txt"), 
                                     col_names = c("ID_file", "Non_ref_insertions"),
                                     delim = " "),
                          read_delim(paste0(TE_RIP_dir, "Ref_TE_numbers.txt"), 
                                     col_names = c("ID_file", "Ref_insertions"),
                                     delim = " "))  %>%
  mutate(Total_insertions = Ref_insertions + Non_ref_insertions) %>%
  filter(Total_insertions > 0) %>%
  left_join(TE_qty)


p1 = TE_qty %>%
  ggplot(aes(x = Non_ref_insertions, y = Ref_insertions, col = Continent)) +
    geom_point(alpha = .8) +
    theme_bw() +
    Color_Continent +
    theme(legend.position = "None")

p2 = TE_qty %>%
  ggplot(aes(x = Percent_TE_Reads, y = Total_insertions, col = Continent)) +
    geom_point(alpha = .8) +
    theme_bw() +
    Color_Continent +
    theme(legend.position = "None")

p3 = TE_qty %>%
  ggplot(aes(x = Percent_TE_Reads, y = Ref_insertions, col = Continent)) +
    geom_point(alpha = .8) +
    theme_bw() +
    Color_Continent +
    theme(legend.position = "None")

p4 = TE_qty %>%
  ggplot(aes(x = Percent_TE_Reads, y = Non_ref_insertions, col = Continent)) +
    geom_point(alpha = .8) +
    theme_bw() +
    Color_Continent +
    theme(legend.position = "None")

cowplot::plot_grid(p1, p2, p3, p4, ncol = 2, nrow = 2)

#Collections comparisons

ggplot(TE_qty, aes(x = Total_insertions, y = Percent_TE_Reads, col = Collection)) +
  geom_point(alpha = .8) +
  theme_bw()  +
  labs(title = "Comparison of whole TE content estimation per sample")

p1 = ggplot(TE_qty, aes(x = Percent_TE_Reads, col = Collection, fill = Collection)) +
  geom_density(alpha = .4) +
  theme_bw() 
p2 = ggplot(TE_qty, aes(x = Total_insertions, col = Collection, fill = Collection)) +
  geom_density(alpha = .4) +
  theme_bw() 
comment = ggplot() + theme_void() + 
                     geom_text(aes(x = 1, y = 1, label = "Comparison of whole TE content estimation per collection"),
                               size = 6)
cowplot::plot_grid(comment, 
                   p1 + theme(legend.position = "None"), 
                   p2 + theme(legend.position = "None"),
                   get_legend(p1 + theme(legend.position = "bottom")),
                   ncol = 1, nrow = 4, rel_heights = c(.3, 1, 1, .5))

p1 = ggplot(TE_qty, aes(x = Ref_insertions, col = Collection, fill = Collection)) +
  geom_density(alpha = .4) +
  theme_bw() 
p2 = ggplot(TE_qty, aes(x = Non_ref_insertions, col = Collection, fill = Collection)) +
  geom_density(alpha = .4) +
  theme_bw() 
comment = ggplot() + theme_void() + 
                     geom_text(aes(x = 1, y = 1, label = "Comparison of reference and non-reference insertions numbers"),
                               size = 6)
cowplot::plot_grid(comment, 
                   p1 + theme(legend.position = "None"), 
                   p2 + theme(legend.position = "None"),
                   get_legend(p1 + theme(legend.position = "bottom")),
                   ncol = 1, nrow = 4, rel_heights = c(.3, 1, 1, .5))

temp = TE_qty %>% filter(!is.na(Continent))%>% filter(!is.na(Collection))
model1 = lm(Non_ref_insertions ~  Collection,
          data=temp)
summary(model1)
## 
## Call:
## lm(formula = Non_ref_insertions ~ Collection, data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -125.176  -20.884   -0.884   19.672  171.116 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       195.328      3.073  63.555  < 2e-16 ***
## CollectionHartmann_FstQst_2015      6.848      4.372   1.566 0.117689    
## CollectionHartmann_Oregon_2016    140.108      4.723  29.665  < 2e-16 ***
## CollectionJGI                      18.749      6.360   2.948 0.003287 ** 
## CollectionJGI_2                     2.297      6.872   0.334 0.738293    
## CollectionJGI_3                    -9.478      8.360  -1.134 0.257251    
## CollectionJGI_4                    -1.328     20.309  -0.065 0.947874    
## CollectionJGI_Thierry              -5.468      6.129  -0.892 0.372584    
## CollectionMMcDonald_2018           12.937      5.218   2.479 0.013360 *  
## CollectionSyngenta                 23.556      3.720   6.331 3.97e-10 ***
## CollectionThird_batch_2018_BM_TM  -35.016      9.220  -3.798 0.000157 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.77 on 832 degrees of freedom
## Multiple R-squared:  0.6015, Adjusted R-squared:  0.5967 
## F-statistic: 125.6 on 10 and 832 DF,  p-value: < 2.2e-16
model2 = lm(Ref_insertions ~  Collection,
          data=temp)
summary(model2)
## 
## Call:
## lm(formula = Ref_insertions ~ Collection, data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -128.967  -11.800    1.923   13.033   90.442 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       183.000      2.005  91.276  < 2e-16 ***
## CollectionHartmann_FstQst_2015    -71.200      2.852 -24.962  < 2e-16 ***
## CollectionHartmann_Oregon_2016     14.457      3.081   4.692 3.16e-06 ***
## CollectionJGI                     -43.923      4.149 -10.587  < 2e-16 ***
## CollectionJGI_2                   -60.000      4.483 -13.384  < 2e-16 ***
## CollectionJGI_3                   -44.200      5.454  -8.104 1.89e-15 ***
## CollectionJGI_4                    -2.667     13.249  -0.201    0.841    
## CollectionJGI_Thierry             -33.442      3.998  -8.364 2.53e-16 ***
## CollectionMMcDonald_2018            2.824      3.404   0.830    0.407    
## CollectionSyngenta                109.967      2.427  45.309  < 2e-16 ***
## CollectionThird_batch_2018_BM_TM  -72.813      6.015 -12.106  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.68 on 832 degrees of freedom
## Multiple R-squared:  0.9043, Adjusted R-squared:  0.9032 
## F-statistic: 786.6 on 10 and 832 DF,  p-value: < 2.2e-16
model3 = lm(Percent_TE_Reads ~  Collection,
          data=temp)
summary(model3)
## 
## Call:
## lm(formula = Percent_TE_Reads ~ Collection, data = temp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9485 -0.9017 -0.0092  0.9203 11.9850 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       15.1347     0.1699  89.067  < 2e-16 ***
## CollectionHartmann_FstQst_2015    -5.4028     0.2423 -22.299  < 2e-16 ***
## CollectionHartmann_Oregon_2016     2.9589     0.2626  11.266  < 2e-16 ***
## CollectionJGI                     -0.3348     0.3485  -0.961    0.337    
## CollectionJGI_2                   -7.3423     0.3764 -19.508  < 2e-16 ***
## CollectionJGI_3                    2.5326     0.4575   5.535 4.19e-08 ***
## CollectionJGI_4                    1.5202     1.1099   1.370    0.171    
## CollectionJGI_Thierry              1.6168     0.3359   4.814 1.77e-06 ***
## CollectionMMcDonald_2018           4.0871     0.2905  14.068  < 2e-16 ***
## CollectionSyngenta                -1.5919     0.2049  -7.768 2.40e-14 ***
## CollectionThird_batch_2018_BM_TM  -0.6011     0.5536  -1.086    0.278    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.9 on 815 degrees of freedom
##   (17 observations deleted due to missingness)
## Multiple R-squared:  0.7185, Adjusted R-squared:  0.715 
## F-statistic:   208 on 10 and 815 DF,  p-value: < 2.2e-16

Comparing the 3 models, it is clear that the reference insertions estimation is the most biased of the 3 methods (when taking the sequencing batch into account). The non-reference insertions does not look too bad, until we add the recent Oregon population.

I now want to see if adding the continent information on top of the collection one to the model brings a significant improvement or not.

model11 = lm(Non_ref_insertions ~  Collection + Continent,
          data=temp)
summary(model11)
## 
## Call:
## lm(formula = Non_ref_insertions ~ Collection + Continent, data = temp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.673 -15.861  -0.884  13.735 171.116 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      164.4625     4.4906  36.624  < 2e-16 ***
## CollectionHartmann_FstQst_2015   -11.5345     3.6058  -3.199 0.001432 ** 
## CollectionHartmann_Oregon_2016    87.2836     4.0142  21.744  < 2e-16 ***
## CollectionJGI                      3.8918     4.7256   0.824 0.410422    
## CollectionJGI_2                   -3.4800     5.1701  -0.673 0.501074    
## CollectionJGI_3                   14.8525     6.3100   2.354 0.018817 *  
## CollectionJGI_4                   -0.9057    14.8009  -0.061 0.951222    
## CollectionJGI_Thierry              7.8013     4.5485   1.715 0.086696 .  
## CollectionMMcDonald_2018         -19.3695     6.5032  -2.978 0.002982 ** 
## CollectionSyngenta                23.9780     4.1834   5.732 1.39e-08 ***
## CollectionThird_batch_2018_BM_TM -35.7658     7.0670  -5.061 5.14e-07 ***
## ContinentAsia                     30.3033    25.9175   1.169 0.242651    
## ContinentEurope                   30.4432     5.1348   5.929 4.48e-09 ***
## ContinentMiddle East             -17.3732     5.1923  -3.346 0.000857 ***
## ContinentNorth America            83.6901     4.9982  16.744  < 2e-16 ***
## ContinentOceania                  63.1717     6.8577   9.212  < 2e-16 ***
## ContinentSouth America            29.1426     5.4751   5.323 1.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.73 on 826 degrees of freedom
## Multiple R-squared:    0.8,  Adjusted R-squared:  0.7961 
## F-statistic: 206.4 on 16 and 826 DF,  p-value: < 2.2e-16
model21 = lm(Ref_insertions ~  Collection + Continent,
          data=temp)
summary(model21)
## 
## Call:
## lm(formula = Ref_insertions ~ Collection + Continent, data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -128.967  -10.457    0.543   12.033   78.458 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       165.826      3.912  42.386  < 2e-16 ***
## CollectionHartmann_FstQst_2015    -78.048      3.141 -24.845  < 2e-16 ***
## CollectionHartmann_Oregon_2016      4.558      3.497   1.303    0.193    
## CollectionJGI                     -52.176      4.117 -12.673  < 2e-16 ***
## CollectionJGI_2                   -67.136      4.504 -14.905  < 2e-16 ***
## CollectionJGI_3                   -47.190      5.497  -8.584  < 2e-16 ***
## CollectionJGI_4                   -18.683     12.895  -1.449    0.148    
## CollectionJGI_Thierry             -35.036      3.963  -8.841  < 2e-16 ***
## CollectionMMcDonald_2018           -6.196      5.666  -1.094    0.274    
## CollectionSyngenta                 93.951      3.645  25.778  < 2e-16 ***
## CollectionThird_batch_2018_BM_TM  -83.241      6.157 -13.520  < 2e-16 ***
## ContinentAsia                      17.415     22.580   0.771    0.441    
## ContinentEurope                    33.190      4.474   7.419 2.93e-13 ***
## ContinentMiddle East                7.138      4.524   1.578    0.115    
## ContinentNorth America             27.074      4.355   6.217 8.02e-10 ***
## ContinentOceania                   26.194      5.975   4.384 1.31e-05 ***
## ContinentSouth America             21.752      4.770   4.560 5.89e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.54 on 826 degrees of freedom
## Multiple R-squared:  0.9143, Adjusted R-squared:  0.9127 
## F-statistic: 551.1 on 16 and 826 DF,  p-value: < 2.2e-16
model31 = lm(Percent_TE_Reads ~  Collection + Continent,
          data=temp)
summary(model31)
## 
## Call:
## lm(formula = Percent_TE_Reads ~ Collection + Continent, data = temp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9485 -0.9033  0.0001  0.8436 11.5606 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       15.2015     0.3352  45.346  < 2e-16 ***
## CollectionHartmann_FstQst_2015    -6.0746     0.2685 -22.623  < 2e-16 ***
## CollectionHartmann_Oregon_2016     2.4598     0.2981   8.253 6.26e-16 ***
## CollectionJGI                     -0.7569     0.3480  -2.175 0.029911 *  
## CollectionJGI_2                   -8.0273     0.3812 -21.057  < 2e-16 ***
## CollectionJGI_3                    2.6711     0.4643   5.753 1.24e-08 ***
## CollectionJGI_4                    0.6738     1.0868   0.620 0.535471    
## CollectionJGI_Thierry              1.4422     0.3355   4.298 1.93e-05 ***
## CollectionMMcDonald_2018           1.6677     0.4814   3.464 0.000559 ***
## CollectionSyngenta                -2.4384     0.3118  -7.820 1.65e-14 ***
## CollectionThird_batch_2018_BM_TM  -1.2122     0.5653  -2.144 0.032314 *  
## ContinentAsia                      0.4002     1.9140   0.209 0.834415    
## ContinentEurope                    0.7796     0.3818   2.042 0.041475 *  
## ContinentMiddle East              -1.1902     0.3852  -3.090 0.002070 ** 
## ContinentNorth America             0.4323     0.3709   1.165 0.244222    
## ContinentOceania                   2.3526     0.5067   4.643 4.01e-06 ***
## ContinentSouth America             0.2622     0.4069   0.644 0.519539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.813 on 809 degrees of freedom
##   (17 observations deleted due to missingness)
## Multiple R-squared:  0.7455, Adjusted R-squared:  0.7404 
## F-statistic: 148.1 on 16 and 809 DF,  p-value: < 2.2e-16
#Comparison of simple vs complex models
anova(model1, model11)
## Analysis of Variance Table
## 
## Model 1: Non_ref_insertions ~ Collection
## Model 2: Non_ref_insertions ~ Collection + Continent
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    832 1005904                                  
## 2    826  504990  6    500914 136.56 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model21)
## Analysis of Variance Table
## 
## Model 1: Ref_insertions ~ Collection
## Model 2: Ref_insertions ~ Collection + Continent
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    832 428076                                  
## 2    826 383300  6     44775 16.082 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model3, model31)
## Analysis of Variance Table
## 
## Model 1: Percent_TE_Reads ~ Collection
## Model 2: Percent_TE_Reads ~ Collection + Continent
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    815 2941.6                                  
## 2    809 2659.7  6    281.95 14.293 1.581e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
temp = TE_qty %>% filter(!is.na(Continent))%>% filter(!is.na(Collection)) %>% filter(!is.na(Cluster))
model11 = lm(Non_ref_insertions ~  Collection + Continent,
          data=temp)
model31 = lm(Percent_TE_Reads ~  Collection + Continent,
          data=temp)
model12 = lm(Non_ref_insertions ~  Collection + Continent + Cluster,
          data=temp)
summary(model12)
## 
## Call:
## lm(formula = Non_ref_insertions ~ Collection + Continent + Cluster, 
##     data = temp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75.51 -14.13  -0.91  12.62 170.90 
## 
## Coefficients: (3 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      148.2760    18.1001   8.192 1.16e-15 ***
## CollectionHartmann_FstQst_2015   -16.0396     3.8237  -4.195 3.07e-05 ***
## CollectionHartmann_Oregon_2016    80.2749     4.2556  18.863  < 2e-16 ***
## CollectionJGI                     -1.3127     5.0295  -0.261 0.794165    
## CollectionJGI_2                    4.4347     5.7895   0.766 0.443936    
## CollectionJGI_3                   10.8945     7.3431   1.484 0.138340    
## CollectionJGI_4                   15.1077    16.6859   0.905 0.365545    
## CollectionJGI_Thierry              7.9616     5.4045   1.473 0.141151    
## CollectionMMcDonald_2018         -39.5282    24.8236  -1.592 0.111740    
## CollectionSyngenta                15.7117     4.3633   3.601 0.000339 ***
## CollectionThird_batch_2018_BM_TM -43.7468     7.0535  -6.202 9.36e-10 ***
## ContinentEurope                    6.3383     8.9825   0.706 0.480645    
## ContinentMiddle East               5.9472    16.9098   0.352 0.725163    
## ContinentNorth America           106.6342    18.8080   5.670 2.07e-08 ***
## ContinentOceania                 102.0648    30.2146   3.378 0.000769 ***
## ContinentSouth America            59.2768    18.9583   3.127 0.001839 ** 
## ClusterV10                         0.2511     6.2934   0.040 0.968184    
## ClusterV11                         1.7960    17.5194   0.103 0.918377    
## ClusterV2                         48.7780    19.3669   2.519 0.011996 *  
## ClusterV3                         -5.9093     7.0367  -0.840 0.401309    
## ClusterV4                        -15.5975    23.5499  -0.662 0.507980    
## ClusterV5                        -18.3211     7.0810  -2.587 0.009865 ** 
## ClusterV6                        -13.0975     7.5300  -1.739 0.082392 .  
## ClusterV7                              NA         NA      NA       NA    
## ClusterV8                              NA         NA      NA       NA    
## ClusterV9                              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.86 on 724 degrees of freedom
## Multiple R-squared:  0.8366, Adjusted R-squared:  0.8317 
## F-statistic: 168.5 on 22 and 724 DF,  p-value: < 2.2e-16
model32 = lm(Percent_TE_Reads ~  Collection + Continent + Cluster,
          data=temp)
summary(model32)
## 
## Call:
## lm(formula = Percent_TE_Reads ~ Collection + Continent + Cluster, 
##     data = temp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9540 -0.8384 -0.0147  0.8687 11.0786 
## 
## Coefficients: (3 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       12.7255     1.3803   9.220  < 2e-16 ***
## CollectionHartmann_FstQst_2015    -6.3569     0.2951 -21.541  < 2e-16 ***
## CollectionHartmann_Oregon_2016     1.9487     0.3277   5.946 4.31e-09 ***
## CollectionJGI                     -0.5999     0.3849  -1.559  0.11952    
## CollectionJGI_2                   -7.7921     0.4435 -17.569  < 2e-16 ***
## CollectionJGI_3                    2.5291     0.5612   4.506 7.72e-06 ***
## CollectionJGI_4                    0.8124     1.2720   0.639  0.52322    
## CollectionJGI_Thierry              1.2944     0.4143   3.124  0.00186 ** 
## CollectionMMcDonald_2018          -2.3335     1.8900  -1.235  0.21737    
## CollectionSyngenta                -2.7579     0.3392  -8.131 1.90e-15 ***
## CollectionThird_batch_2018_BM_TM  -1.2967     0.5728  -2.264  0.02388 *  
## ContinentEurope                    1.5390     0.6865   2.242  0.02528 *  
## ContinentMiddle East               0.8308     1.2894   0.644  0.51955    
## ContinentNorth America             2.2689     1.4335   1.583  0.11393    
## ContinentOceania                  10.8451     2.3014   4.713 2.95e-06 ***
## ContinentSouth America             2.6608     1.4453   1.841  0.06603 .  
## ClusterV10                         1.1505     0.4796   2.399  0.01670 *  
## ClusterV11                         1.9641     1.3336   1.473  0.14128    
## ClusterV2                          2.0416     1.4744   1.385  0.16658    
## ClusterV3                         -3.0521     0.5419  -5.632 2.56e-08 ***
## ClusterV4                         -5.6477     1.7927  -3.150  0.00170 ** 
## ClusterV5                          0.1073     0.5425   0.198  0.84319    
## ClusterV6                          0.1653     0.5746   0.288  0.77373    
## ClusterV7                              NA         NA      NA       NA    
## ClusterV8                              NA         NA      NA       NA    
## ClusterV9                              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.74 on 709 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.7593, Adjusted R-squared:  0.7518 
## F-statistic: 101.6 on 22 and 709 DF,  p-value: < 2.2e-16
anova(model11, model12)
## Analysis of Variance Table
## 
## Model 1: Non_ref_insertions ~ Collection + Continent
## Model 2: Non_ref_insertions ~ Collection + Continent + Cluster
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    731 400731                                  
## 2    724 378320  7     22412 6.1271 5.656e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model31, model32)
## Analysis of Variance Table
## 
## Model 1: Percent_TE_Reads ~ Collection + Continent
## Model 2: Percent_TE_Reads ~ Collection + Continent + Cluster
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)    
## 1    716 2305.6                               
## 2    709 2146.7  7    158.92 7.4981  1e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The fit is indeed improved by adding the collection information, with both TE quantity estimations.

depth = read_tsv(paste0(vcf_dir, "Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.idepth"))
GC = read_delim(paste0(RIP_DIR, "GC_percent.txt"), col_names = c("ID_file", "TE", "Estimate", "Median_GC", "Mean_GC"), delim = " ") %>%
  dplyr::select(-TE, -Estimate)

biases = inner_join(TE_qty, depth, by = c("ID_file" = "INDV")) %>%
  inner_join(GC)


p1 = biases %>%
  ggplot(aes(x = Mean_GC, y = Percent_TE_Reads, col = Collection)) +
     geom_point(alpha = .5) +
     theme_bw()
p2 = biases %>%
  ggplot(aes(x = Mean_GC, y = Total_insertions, col = Collection)) +
     geom_point(alpha = .5) +
     theme_bw()

p3 = biases %>%
  ggplot(aes(x = MEAN_DEPTH, y = Percent_TE_Reads, col = Collection)) +
     geom_point(alpha = .5) +
     theme_bw()

p4 = biases %>%
  ggplot(aes(x = MEAN_DEPTH, y = Total_insertions, col = Collection)) +
     geom_point(alpha = .5) +
     theme_bw() 

bloc = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2+ theme(legend.position = "none"))
cowplot::plot_grid(bloc, get_legend(p3 + theme(legend.position = "bottom")), 
                   rel_heights = c(1, .2), ncol = 1)

bloc = cowplot::plot_grid(p3 + theme(legend.position = "none"), p4 + theme(legend.position = "none"))
cowplot::plot_grid(bloc, get_legend(p3 + theme(legend.position = "bottom")), 
                   rel_heights = c(1, .2), ncol = 1)

p1 = biases %>%
  filter(MEAN_DEPTH < 80) %>%
  ggplot(aes(x = MEAN_DEPTH, y = Total_insertions)) +
     geom_point(alpha = .5) +
     theme_bw()  +
     geom_smooth(method = "lm")
p2 = biases %>%
  filter(MEAN_DEPTH < 25) %>%
  ggplot(aes(x = MEAN_DEPTH, y = Total_insertions)) +
     geom_point(alpha = .5) +
     theme_bw()  +
     geom_smooth(method = "lm")
bloc = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2+ theme(legend.position = "none"))
titre = ggplot() + geom_text(aes(x = 1, y = 1, label = "Depth bias is mostly found below 20 or 25."), size = 6) + theme_void()
cowplot::plot_grid(titre, bloc, ncol = 1, rel_heights = c(0.1, 1))

model = lm(Percent_TE_Reads ~ Mean_GC + MEAN_DEPTH, data = filter(biases, MEAN_DEPTH < 25))
summary(model)
## 
## Call:
## lm(formula = Percent_TE_Reads ~ Mean_GC + MEAN_DEPTH, data = filter(biases, 
##     MEAN_DEPTH < 25))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.3611  -1.7650   0.6579   2.1907  10.4482 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.67655    3.69856  11.268   <2e-16 ***
## Mean_GC     -0.72337    0.07950  -9.099   <2e-16 ***
## MEAN_DEPTH   0.36730    0.03447  10.655   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.572 on 513 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.2789, Adjusted R-squared:  0.2761 
## F-statistic:  99.2 on 2 and 513 DF,  p-value: < 2.2e-16
model = lm(Total_insertions ~ Mean_GC + MEAN_DEPTH, data = filter(biases, MEAN_DEPTH < 25))
round(summary(model)$coefficients, 4)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 176.5883    74.9782  2.3552   0.0189
## Mean_GC       0.4096     1.6119  0.2541   0.7995
## MEAN_DEPTH   11.5581     0.6957 16.6134   0.0000
rsq = round(summary(model)$r.squared, 2)

  
temp = biases %>%
  filter(MEAN_DEPTH < 25) %>%
  pivot_longer(cols = c(Total_insertions, Percent_TE_Reads), names_to = "Method", values_to = "TE_estimate") 
y_values = summarize(temp, max(TE_estimate))

ggplot(temp, aes(x = MEAN_DEPTH, y = TE_estimate)) +
     geom_point(alpha = .5) +
     theme_bw()  +
     geom_smooth(method = "lm") +
     facet_wrap(vars(Method), scales = "free")

Frequency of TE insertions between samples and populations

The non reference TIPs have some sort of support estimation. As a first step, I want to filter the ones that don’t seem reliable.

TE_insertions = bind_rows(read_tsv(paste0(TE_RIP_dir, "Non-ref_all_strains.bed"), 
           col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand")) %>%
             mutate(ref_non_ref_TIP = "non_ref"),
          read_tsv(paste0(TE_RIP_dir, "Ref_all_strains.bed"), 
           col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand")) %>%
             mutate(ref_non_ref_TIP = "ref")) %>%
  separate(col = chromosome, into = c("X1", "CHR", "X2", "X3", "X4"), sep = "_") %>%
  separate(col = info, into = c("TE_family", "TSD", "Allele_Frequency", "3_support", "5_support", "ref_reads"), sep = "\\|") %>%
  dplyr::select(-c(X1, X2, X3, X4, score, strand)) %>%
  left_join(Zt_meta %>% dplyr::select(ID_file, Continent, Cluster, Sampling_Date, Collection)) %>%
  filter(!is.na(Continent)) %>% filter(Continent != "Asia") %>%
  mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
  unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) %>%
  separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"))


threshold = 0.9
subset_nb = 10000
subset = slice_sample(TE_insertions, n = subset_nb)

nb_non_filtered = nrow(TE_insertions)
nb_filtered = nrow(filter(TE_insertions, is.na(Allele_Frequency) | Allele_Frequency > threshold))

temp = nrow(filter(subset, as.numeric(Allele_Frequency) < threshold))
ggplot(subset, aes(as.numeric(Allele_Frequency))) +
  geom_density() +
  geom_vline(aes(xintercept = threshold), color = "#82C0CC") +
  geom_label(x = 0.5, y = 100, fill = "white", 
            label = str_wrap(paste0((100*temp/subset_nb), "% TE with allelic frequency below ", threshold,
                             ". \n   The dataset goes from ", nb_non_filtered, " detected_TE_insertions to ",
                             nb_filtered, " after filtering."), 70)) +
  theme_bw() + 
  labs(x = "Allele Frequency of the TIPs PAV for each isolate",
       title = "Threshold for non-reference TIP filtering")

TE_insertions = filter(TE_insertions, is.na(Allele_Frequency) | Allele_Frequency > threshold)

Once, this is done, I want to explore some basic statistics about the TIPs. In many other studies, it seems that the TIPs SFS is biased toward lower frequency. As shown above, we have a depth bias here, so it is likely that this bias, if found in Z. tritici would be made even stronger by a non-detection artefact in low depth genomes. So in that context, here are some questions of interest: What are the frequency of the TE insertions we found? Are they shared by several samples?

TE_insertions_counts = TE_insertions %>%
  unite(Continent, ID_file, col = "for_display", remove = F) %>%
  dplyr::count(TE_insertion, ref_non_ref_TIP)


# SFS
cowplot::plot_grid(ggplot() + theme_void() + 
                     geom_text(aes(x = 1, y = 1, label = "Most TE insertions are found at low frequency."),
                               size = 6),
                   ggplot(TE_insertions_counts, aes(x = n)) +
                     geom_density( col = "#2EC4B6", fill = "#CBF3F0") +  theme_bw() +
                     labs(x = "TE insertion count"),
                   TE_insertions_counts %>%  
                     filter(n > 50) %>%
                     ggplot(aes(x = n)) +  geom_density(col = "#2EC4B6", fill = "#CBF3F0") +  theme_bw()+
                     labs(x = "TE insertion count (above 11)"),
                   rel_heights = c(0.4, 1, 1), ncol = 1)

# Doughnut chart of frequencies
TE_insertions_counts %>%
  mutate(for_bar = case_when(n == 1 ~ "01", 
                             n == 2 ~ "02",
                             n <= 10 ~ "10",
                             n <= 100 ~ "100",
                             n >= 100 ~ "100 +")) %>%
ggplot(aes(x = 1, fill=for_bar)) +
  geom_bar(position = "fill") +
  theme_bw()  +
  scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
  labs(fill = "Number of strains",
       title = "Three quarter of all TE insertions are singletons.")

data = TE_insertions_counts %>%
  mutate(for_bar = case_when(n == 1 ~ "01", 
                             n == 2 ~ "02",
                             n <= 10 ~ "10",
                             n <= 100 ~ "100",
                             n >= 100 ~ "100 +")) %>%
  dplyr::count(for_bar)

data$fraction = data$n / sum(data$n)
data$ymax = cumsum(data$fraction) # Compute the cumulative percentages (top of each rectangle)
data$ymin = c(0, head(data$ymax, n=-1)) # Compute the bottom of each rectangle
data$labelPosition <- (data$ymax + data$ymin) / 2
data$label1 <- ifelse(round(data$fraction*100) > 2, paste0(round(data$fraction*100), "%"), "")
data$label2 <- ifelse(round(data$fraction*100) > 2, "", paste0(round(data$fraction*100), "%"))

ggplot(data, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=for_bar)) +
  geom_rect() +
  coord_polar(theta="y") + 
  xlim(c(2,4)) + 
  theme_void() +
  geom_text( x=3.5, aes(y=labelPosition, label=label1), size=4) +
  geom_text( x=4.2, aes(y=labelPosition, label=label2), size=4) +
  scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
  labs(fill = "Number of strains",
       title = "Three quarter of all TE insertions are singletons.")

temp = TE_insertions_counts %>% 
  separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
  separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
                Length = abs(as.numeric(Start) - as.numeric(End)), 
                TIP_nb = n) %>%
  filter(Length < 2000, ref_non_ref_TIP == "ref") %>%
  mutate(for_bar = case_when(TIP_nb == 1 ~ "01", 
                             TIP_nb == 2 ~ "02",
                             TIP_nb <= 10 ~ "10",
                             TIP_nb <= 100 ~ "100",
                             TIP_nb >= 100 ~ "100 +")) 

ggplot(temp, aes(x = Length, y = for_bar, fill = for_bar)) +
  geom_density_ridges(alpha = .9) + 
  theme_bw() +
  scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) +
  labs(x = "TIP frequency category", y = "Length of the TIP",
       title = "Length of the reference TIPs per frequency of the TIP")

Although most TIPs are found a low to very low frequency in our analysis, there are some that are identified in several samples. Among the TE insertions which are found multiple times, are the TIPs shared by isolates from different clusters or are they always from a single cluster?

theshold = 5

#Filtering only insertions found more than 5 times
insertions_per_pop = TE_insertions %>%
  filter(!is.na(Cluster)) %>%
  dplyr::count(Cluster, TE_insertion, ref_non_ref_TIP) %>%
  filter(n > theshold) %>%
  pivot_wider(names_from = Cluster, values_from = n) %>%
  mutate(n = 1) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))),
                Nb_population = 11 - NA_count)


#Number of TIPS shared by populations
temp2 = insertions_per_pop %>% 
  dplyr::count(Nb_population, name = "Insertion_number") %>%
  mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific", 
                                 ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
         label = ifelse(Nb_population == 1, paste0("N=", Insertion_number), 
                                 ifelse(Nb_population == 11, paste0("N=", Insertion_number), "")))
prop_non_spe = round(sum(temp2$Insertion_number[temp2$Insertion_type != "Population specific"])/sum(temp2$Insertion_number)*100)

ggplot(temp2, aes(x = reorder(as.character(Nb_population), Nb_population), 
                  y = Insertion_number, fill = Insertion_type)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(y = Insertion_number + 30, label = label), size=3) +
  geom_label(x = 7, y = max(temp2$Insertion_number)/2, 
            label = str_wrap(paste0(prop_non_spe, "% of insertions are found in two or more populations"), 20),
            fill = "white") + 
  theme_bw() + 
  scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
  labs(x = "Number of populations displaying an insertion", 
       y = "Number of TE insertions",
       title = str_wrap(paste0("Two thirds of TE insertions found in more than ", 
                               theshold, " isolates are population-specific."),55))

#Numbers of TIPS shared by population: ref vs non-ref
p1 = insertions_per_pop %>% 
  dplyr::count(Nb_population, name = "Insertion_number") %>%
  mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific", 
                                 ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
         label = ifelse(Nb_population == 1, paste0("N=", Insertion_number), 
                                 ifelse(Nb_population == 11, paste0("N=", Insertion_number), ""))) %>%
  ggplot(aes(x = reorder(as.character(Nb_population), Nb_population), 
                  y = Insertion_number, fill = Insertion_type)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(y = Insertion_number + 30, label = label), size=3) + 
  theme_bw() +
  scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
  labs(x = "Number of populations displaying an insertion", 
       y = "Number of reference TIP")
p2 = insertions_per_pop %>% 
  filter(ref_non_ref_TIP != "ref") %>%
  dplyr::count(Nb_population, name = "Insertion_number") %>%
  mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific", 
                                 ifelse(Nb_population == 11, "Shared by all populations", "Intermediate")),
         label = ifelse(Nb_population == 1, paste0("N=", Insertion_number), 
                                 ifelse(Nb_population == 11, paste0("N=", Insertion_number), ""))) %>%
  ggplot(aes(x = reorder(as.character(Nb_population), Nb_population), 
                  y = -Insertion_number, fill = Insertion_type)) + 
  geom_bar(stat = "identity") +
  geom_text(aes(y = -Insertion_number - 30, label = label), size=3)  + 
  theme_bw() +
  scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6")) +
  labs(x = "Number of populations displaying an insertion", 
       y = "Number of non-reference TIP")

cowplot::plot_grid(p2 + coord_flip() + theme(legend.position = "none",
                                             plot.margin = unit(c(0, 0, 0, 0), "cm")), 
                   p1 + coord_flip() + theme(legend.position = "none",
                                             plot.margin = unit(c(0, 0, 0, 0), "cm"),
                                             axis.title.y = element_blank(),
                                             axis.text.y = element_blank(),
                                             axis.ticks.y = element_blank()),
                   align = "hv")

non_pop_spe = insertions_per_pop %>% filter(NA_count < 10)
nb_per_cluster = dplyr::count(Zt_meta, Cluster, name = "Nb_in_cluster")


#Number of population-specific TIPs per population
insertions_per_pop %>% 
  filter(Nb_population == 1) %>%
  dplyr::select(-NA_count, -Nb_population) %>%
  pivot_longer(cols = -c(TE_insertion, n, ref_non_ref_TIP), names_to = "Cluster", values_to = "Nb_present") %>%
  filter(!is.na(Nb_present)) %>%
  dplyr::count(Cluster, name = "Nb_cluster_specific") %>%
  full_join(nb_per_cluster) %>%
  ggplot(aes(x = Nb_in_cluster, y = Nb_cluster_specific, col = Cluster, label = Cluster)) +
   geom_point() +
   geom_label_repel(box.padding = 0.35, point.padding = 0.5,
                  segment.color = 'grey50') +
   theme_bw() +
   Color_Cluster + 
   labs(x = "Number of isolate per population", y = "Number of population-specific insertion",
        title = "More TE insertions are detected in heavily sampled populations.")

So far, I’ve looked at TIPs from all TE categories together. I’m curious to see how it matches with the superfamilies and orders of TEs.

temp2 = insertions_per_pop %>%
  mutate(Insertion_type = ifelse(Nb_population == 1, "Population specific",
                                 ifelse(Nb_population == 11, "Shared by all populations", "Intermediate"))) %>%
  dplyr::select(TE_insertion, Insertion_type)

#TE insertions per Superfamily and frequency
full_join(TE_insertions_counts, temp2) %>% 
  separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
  separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
                Length = abs(as.numeric(Start) - as.numeric(End)), 
                TIP_nb = n) %>%
  dplyr::count(Order, Superfamily, Insertion_type, TIP_nb) %>%
  mutate(for_bar = case_when(TIP_nb == 1 ~ "01", 
                             TIP_nb == 2 ~ "02",
                             TIP_nb <= 10 ~ "10",
                             TIP_nb <= 100 ~ "100",
                             TIP_nb >= 100 ~ "100 +")) %>%
  ggplot(aes(x = Superfamily, y = n, fill = for_bar)) +
  geom_bar(stat= "identity") + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) + 
  facet_wrap(vars(Order), scales = "free_x")+
  scale_fill_manual(values = c("#CBF3F0", "#2EC4B6", "#1F847A", "#1B746B", "#041F1E")) 

#TE insertions per Superfamily and ref/non-ref 
#The insertions per pop were filtered for only TIP higher than 5, so temp2 and the following plot as well
right_join(TE_insertions_counts, temp2) %>% 
  separate(TE_insertion, into = c("TE_family","CHR","Start", "End"), sep = ":", remove = F) %>%
  separate(TE_family, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  dplyr::mutate(Order = ifelse(grepl('^D',TE_family), "Class II (DNA transposons)", "Class I (retrotransposons)"),
                Length = abs(as.numeric(Start) - as.numeric(End))) %>%
  dplyr::count(Order, Superfamily, Insertion_type, ref_non_ref_TIP) %>%
  ggplot(aes(x = Superfamily, y = n, fill = Insertion_type)) +
  geom_bar(stat= "identity") + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) + 
  facet_wrap(vars(Order, ref_non_ref_TIP), scales = "free")+ 
  scale_fill_manual(values = c("grey", "#ff9f1c", "#2ec4b6"))

TE across space: population differentiation

TE content per population and per continent

Let’s compare the TE content per population and per continent. First, I will make the comparison using the read mapping method.

### Plot
#Building the basic violin plot per continent


#Continents
model = lm(Percent_TE_Reads ~ Continent + Collection,
          data=TE_qty %>%  filter(!is.na(Continent), !is.na(Cluster)))
CLD = cld(lsmeans(model, ~ Continent),  alpha   = 0.05, Letters = letters, adjust  = "sidak") 
CLD$.group=gsub(" ", "", CLD$.group)
TE_prop = ggplot(TE_qty %>%  filter(!is.na(Continent), !is.na(Cluster))) +
  geom_hline(aes(yintercept = world_avg),
             color = "gray30", size = 0.6, linetype = "dashed")  +
  theme_cowplot()  +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Percentage of reads", width = 30),
        subtitle = str_wrap(paste(""), width = 70)) +
  geom_text(data = CLD, aes(x = Continent, label = .group, y = 34), color   = "black")+
  geom_violin(aes(x = Continent, y = Percent_TE_Reads, fill = Continent), alpha = .8) +
  Fill_Continent + Color_Continent +
  stat_summary(aes(x = Continent, y = Percent_TE_Reads), fun = mean, geom = "point", size = 2, color = "grey30") +
  labs(title = "Amount of reads mapping on TE consensus per continent")
TE_prop

#Clusters
model = lm(Percent_TE_Reads ~  Cluster + Collection, data=TE_qty)
CLD = cld(lsmeans(model, ~ Cluster),  alpha   = 0.05, Letters = letters, adjust  = "sidak") 
CLD$.group=gsub(" ", "", CLD$.group)
TE_qty %>%  
  filter(!is.na(Continent), !is.na(Cluster)) %>%
  ggplot() +
    geom_hline(aes(yintercept = world_avg),
               color = "gray30", size = 0.6, linetype = "dashed")  +
    theme_cowplot()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
    labs (x = "", y = str_wrap("Percentage of reads", width = 30),
          subtitle = str_wrap(paste(""), width = 70)) +
    geom_text(data = CLD, aes(x = Cluster, label = .group, y = 34), color   = "black")+
    geom_violin(aes(x = Cluster, y = Percent_TE_Reads, fill = Cluster), alpha = .8) +
    stat_summary(aes(x = Cluster,  y = Percent_TE_Reads), fun = mean, geom = "point", size = 2, color = "grey30") +
    labs(title = "Amount of reads mapping on TE consensus per cluster") +
    Fill_Cluster

The statistics used here are a one-way ANOVA with block. Blocks are used in an analysis of variance or similar models in order to account for suspected variation from factors other than the treatments or main independent variables being investigated. Here I considered the collection as the confounding factor. It definitely has an effect and was thus accounted for in the statistics related to TE content and to RIP level.

Genomes from isolates in Oceania and the Americas seem to contain more TE than those from the Middle-East, in particular.

Let’s see if the same result is obtained with the TE insertions as detected by ngs_te_mapper2.

TE_insertion_wrld_average = TE_qty  %>%
  dplyr::summarize(avg = mean(as.numeric(Total_insertions), na.rm = T)) %>%
  pull(avg)


#Continents
model = lm(Total_insertions ~  Continent + Collection, data=TE_qty)
CLD = cld(lsmeans(model, ~ Continent), alpha   = 0.05, Letters = letters, adjust  = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)
TE_qty %>%
  filter(!is.na(Continent)) %>%
  ggplot(aes(x = Continent, y = Total_insertions, fill = Continent)) +
    geom_violin(alpha = .8)  +
  geom_text(data = CLD, aes(x = Continent, label = .group, y = 750), color   = "black") +
  Fill_Continent + Color_Continent +
  theme_cowplot()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
        title = "TE insertions per continent",
        subtitle = str_wrap(paste(""), width = 70)) +
  stat_summary(aes(x = Continent,  y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
  geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")

#Cluster
model = lm(Total_insertions ~  Cluster, data=TE_qty)
CLD = cld(lsmeans(model, ~ Cluster), alpha   = 0.05, Letters = letters, adjust  = "sidak")
CLD$.group=gsub(" ", "", CLD$.group)

TE_qty %>%
  filter(!is.na(Cluster)) %>%
  ggplot(aes(x = Cluster, y = Total_insertions, fill = Cluster)) +
    geom_violin(alpha = .8)  +
  theme_cowplot()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Number of insertions per sample", width = 30),
        title = "TE insertions per cluster",
        subtitle = str_wrap(paste(""), width = 70)) +
  geom_text(data = CLD, aes(x = Cluster, label = .group, y = 750), color   = "black") +
  Fill_Cluster +
  stat_summary(aes(x = Cluster,  y = Total_insertions), fun = mean, geom = "point", size = 2, color = "grey30") +
  geom_hline(aes(yintercept = TE_insertion_wrld_average), color = "gray30", size = 0.6, linetype = "dashed")

The pattern is somewhat different with the TE insertions. In this case, the Oceania samples don’t look particularly high. However, the pattern with the African and Middle-Eastern samples being lower is even clearer.

ngs_te_mapper2 distinguishes between reference and non-reference insertions. I am curious to see how they differ, so I create the same plot but with the two values separately.

p1 = TE_qty %>%
  filter(!is.na(Continent) & Continent != "Asia") %>%
  ggplot(aes(x = Continent, y = Non_ref_insertions, fill = Continent)) +
    geom_violin(alpha = .8)  +
  Fill_Continent + Color_Continent +
  theme_cowplot()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Non-reference insertions", width = 30))
p2 = TE_qty %>%
  filter(!is.na(Continent) & Continent != "Asia") %>%
  ggplot(aes(x = Continent, y = Ref_insertions, fill = Continent)) +
    geom_violin(alpha = .8)  +
  Fill_Continent + Color_Continent +
  theme_cowplot()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Reference insertions", width = 30))

cowplot::plot_grid(p1 + coord_flip() + theme(legend.position = "none"),
                   p2 + coord_flip(), rel_widths = c(1, 1.9))

Aside from the repeating pattern of low TE content in the Middle-East/Africa, the most striking difference is the European samples containing more reference insertions. Considering, that the reference genome is a sample isolated from Denmark, it is expected that other European samples would be closer.

I know for sure that my estimates are biased, either because of depth, GC content, or both. So I want to make some more checks to ensure that I am confident in the results I show. First, I want to try and compare the clusters intra-collection to ensure that the results observed previously can be confirmed within each collection.

temp = TE_qty %>%
  filter(Cluster != "NA") %>%
  dplyr::count(Collection, Cluster) %>%
  mutate(n = ifelse(n < 5, NA, n)) %>%
  pivot_wider(names_from = Cluster, values_from = n)%>%
  rowwise() %>%
  dplyr::mutate(NA_count = sum(is.na(c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11))))
                
temp
## # A tibble: 12 x 13
## # Rowwise: 
##    Collection    V10   V11    V2    V5    V6    V7    V8    V9    V4    V1    V3
##    <chr>       <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
##  1 ETHZ_2020      24    14    NA    22    14     7     8    12    NA    NA    NA
##  2 Hartmann_F…    50    NA    22    NA    25    NA    NA    NA    27    NA    NA
##  3 Hartmann_O…    94    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  4 JGI             6    NA    13    NA    NA    NA    10    NA    NA    NA    NA
##  5 JGI_2          NA    NA    11    NA    NA    NA    NA    NA    NA    NA    NA
##  6 JGI_3          NA    NA     8    NA    NA     6    NA    NA    NA    NA    NA
##  7 JGI_4          NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  8 JGI_Thierry    NA     7    11    NA    NA    NA    NA    NA    NA    NA    NA
##  9 MMcDonald_…    NA    NA    NA    NA    NA    NA    NA    NA    NA    16    31
## 10 Syngenta       NA    NA   267    NA    NA    NA    NA    NA    NA    NA    NA
## 11 Third_batc…    NA    NA     8    NA    NA    NA    NA    NA    NA    NA    NA
## 12 <NA>           NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## # … with 1 more variable: NA_count <int>
collections_multiple = temp %>% 
  filter(NA_count <= 9) %>% 
  pivot_longer(cols = -c(Collection, NA_count), names_to = "Cluster", values_to = "n") %>% 
  filter(!is.na(n))

inner_join(TE_qty, dplyr::select(collections_multiple, Collection, Cluster)) %>%  
ggplot() +
  theme_cowplot()  +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Percentage of reads", width = 30),
        subtitle = str_wrap(paste(""), width = 70)) +
  geom_boxplot(aes(x = Cluster, y = Percent_TE_Reads, fill = Cluster), alpha = .8) +
  Fill_Cluster + Color_Cluster +
  labs(title = "Amount of reads mapping on TE consensus  per collection per cluster") + 
  facet_wrap(vars(Collection), scales = "free")

inner_join(TE_qty, dplyr::select(collections_multiple, Collection, Cluster)) %>%  
ggplot() +
  theme_cowplot()  +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Percentage of reads", width = 30),
        subtitle = str_wrap(paste(""), width = 70)) +
  geom_boxplot(aes(x = Cluster, y = Total_insertions, fill = Cluster), alpha = .8) +
  Fill_Cluster + Color_Cluster +
  labs(title = "TE insertions per collection per cluster") + 
  facet_wrap(vars(Collection), scales = "free")

Second, I check whether a plot and statistics including the depth of coverage still recovers the difference between groups that I have observed above.

threshold = 10 #Number of minimum presence count

# 
temp = inner_join(filter(TE_insertions_counts, n > 10), TE_insertions) %>%
  inner_join(dplyr::select(non_pop_spe, TE_insertion)) %>%
  group_by(ID_file, Continent) %>%
  dplyr::count() %>%
  inner_join(depth, by = c("ID_file" = "INDV")) 

ggplot(temp, aes(x = MEAN_DEPTH, y = n, col = Continent)) +
     geom_point(alpha = .5) +
     theme_bw() +
     labs(title = "Depth bias check after TIP comparison between continent",
          subtitle = "Only non-population specific",
          x = "Mean depth of coverage", 
          y = paste0("Number of TIP with more than ", threshold, " presence")) +
     Color_Continent

ggplot(temp, aes(x = MEAN_DEPTH, y = n, col = Continent)) +
     geom_point(alpha = .5) +
     theme_bw() +
     labs(title = "Depth bias check after TIP comparison between continent",
          subtitle = "Only non-population specific",
          x = "Mean depth of coverage", 
          y = paste0("Number of TIP with more than ", threshold, " presence")) +
     Color_Continent +
  geom_smooth(method = "glm", formula = y~log(x), se = F, 
                      method.args = list(family = gaussian(link = 'log')))

# 
inner_join(filter(TE_insertions_counts, n > threshold), TE_insertions) %>%
  group_by(ID_file, Continent, Cluster) %>%
  dplyr::count() %>%
  inner_join(depth, by = c("ID_file" = "INDV")) %>%
  ggplot(aes(x = MEAN_DEPTH, y = n, col = Cluster, shape = Cluster)) +
     geom_point(alpha = .5) +
     theme_bw() +
     labs(title = "Depth bias check TIP comparison between continent",
          subtitle = "All TIPs (population specific and others)",
          x = "Mean depth of coverage", 
          y = paste0("Number of TIP with more than ", threshold, " presence")) +
     scale_shape_manual(values = c(0,1,2,3,4,5,6,7,8,9,10)) + 
     Color_Cluster #+

#    geom_smooth(method = "lm", se = F)
inner_join(filter(TE_insertions_counts, n > threshold), TE_insertions) %>%
  group_by(ID_file, Continent, Cluster) %>%
  dplyr::count() %>%
  inner_join(depth, by = c("ID_file" = "INDV")) %>%
  ggplot(aes(x = MEAN_DEPTH, y = n, col = Cluster, shape = Cluster)) +
     geom_point(alpha = .5) +
     theme_bw() +
     labs(title = "Depth bias check TIP comparison between continent",
          subtitle = "All TIPs (population specific and others)",
          x = "Mean depth of coverage", 
          y = paste0("Number of TIP with more than ", threshold, " presence")) +
     scale_shape_manual(values = c(0,1,2,3,4,5,6,7,8,9,10)) + 
     Color_Cluster 

model = lm(n ~ Continent + MEAN_DEPTH, data = temp)
summary(model)
## 
## Call:
## lm(formula = n ~ Continent + MEAN_DEPTH, data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -218.540  -25.645    7.865   26.356   76.783 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            110.11554    6.02568  18.274  < 2e-16 ***
## ContinentEurope         43.05024    6.53690   6.586 7.97e-11 ***
## ContinentMiddle East    -8.88905    7.08540  -1.255     0.21    
## ContinentNorth America  59.96574    6.34281   9.454  < 2e-16 ***
## ContinentOceania        62.99936    6.87559   9.163  < 2e-16 ***
## ContinentSouth America  63.78178    7.71384   8.268 5.28e-16 ***
## MEAN_DEPTH               2.58642    0.08561  30.213  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.56 on 840 degrees of freedom
## Multiple R-squared:  0.6978, Adjusted R-squared:  0.6957 
## F-statistic: 323.3 on 6 and 840 DF,  p-value: < 2.2e-16

Non-guided clustering

We can see that different clusters and continent have a different overall TE content. However, I am curious to see if there is a geographical clustering intrinsic to the TE content that can be uncovered without an a priori sorting of samples.

First, I will use the ngs_te_mapper2 results, and use each insertion identified in more than 5 samples to create different clustering. First, I’ll simply create a heatmap, then I’ll move on to PCAs, first by cluster, then with each isolate.

matrice = TE_insertions %>%
  #filter(!is.na(Cluster)) %>%
  inner_join(dplyr::select(non_pop_spe, TE_insertion)) %>%
  inner_join(filter(TE_insertions_counts, n > 5))%>%
  dplyr::count(TE_insertion, Cluster) %>%
  left_join(nb_per_cluster) %>%
  mutate(Freq = n/Nb_in_cluster) %>%
  dplyr::select(Cluster, TE_insertion, Freq) %>%
  pivot_wider(names_from = TE_insertion, values_from = Freq, values_fill = 0) 


#Heatmap per genetic cluster
temp = as.matrix(matrice[,c(3:ncol(matrice))])[, which(apply(as.matrix(matrice[,c(3:ncol(matrice))]), 2, var) != 0)]

rownames(temp) = matrice$Cluster
pheatmap(temp, show_colnames = F)

#PCA per cluster
TE.pca = prcomp(temp, center = TRUE,scale. = TRUE)

temp = as.tibble(cbind(matrice, as.data.frame(TE.pca$x))) %>%
  dplyr::select(Cluster, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8)

p1 = ggplot(temp, aes(x = PC1, y = PC2, label = Cluster, col = Cluster)) +
  geom_text() +
  Color_Cluster +
  theme_bw() + theme(legend.position = "none")
p2 = ggplot(temp, aes(x = PC3, y = PC4, label = Cluster, col = Cluster)) +
  geom_text() +
  Color_Cluster +
  theme_bw()+ theme(legend.position = "none")
p3 = ggplot(temp, aes(x = PC5, y = PC6, label = Cluster, col = Cluster)) +
  geom_text() +
  Color_Cluster +
  theme_bw()+ theme(legend.position = "none")
p4 = ggplot(temp, aes(x = PC7, y = PC8, label = Cluster, col = Cluster)) +
  geom_text() +
  Color_Cluster +
  theme_bw()+ theme(legend.position = "none")
cowplot::plot_grid(p1, p2, p3, p4)

#PCA per isolate
matrice = TE_insertions %>%
  filter(!is.na(Cluster)) %>%
  inner_join(filter(TE_insertions_counts, n > 10)) %>%
  dplyr::count(TE_insertion, ID_file, Cluster) %>%
  group_by(ID_file) %>%
  mutate(Count_strain = sum(n)) %>%
  pivot_wider(names_from = TE_insertion, values_from = n, values_fill = 0) 


temp = as.matrix(matrice[,c(3:ncol(matrice))])[, which(apply(as.matrix(matrice[,c(3:ncol(matrice))]), 2, var) != 0)]

TE.pca = prcomp(temp, center = TRUE, scale. = TRUE)

#And now all against all but not interactive
temp = as.tibble(cbind(matrice, as.data.frame(TE.pca$x))) %>%
  dplyr::select(ID_file, Cluster, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8)

p1 = ggplot(temp, aes(x = PC1, y = PC2, col = Cluster)) +
  geom_point() +
  Color_Cluster +
  theme_bw() + theme(legend.position = "none")
p2 = ggplot(temp, aes(x = PC3, y = PC4, col = Cluster)) +
  geom_point() +
  Color_Cluster +
  theme_bw() + theme(legend.position = "none")
p3 = ggplot(temp, aes(x = PC5, y = PC6, col = Cluster)) +
  geom_point() +
  Color_Cluster +
  theme_bw() + theme(legend.position = "none")
p4 = ggplot(temp, aes(x = PC7, y = PC8, col = Cluster)) +
  geom_point() +
  Color_Cluster +
  theme_bw() + theme(legend.position = "none")
cowplot::plot_grid(p1, p2, p3, p4)

It does look like there is clustering of samples according to geography. This is interesting as it shows that the types of TEs are different in different populations.

temp = TE_insertions %>%
  filter(Allele_Frequency >= threshold | is.na(Allele_Frequency))  %>%
  dplyr::count(Continent, Order, Superfamily) %>%
  group_by(Continent, Order) %>%
  mutate(Count_Order = sum(n)) %>%
  group_by(Continent) %>%
  mutate(Count_Continent = sum(n)) %>%
  ungroup() %>%
  mutate(Prop_superfamily = n/Count_Continent, 
         Prop_Order = n/Count_Continent) 

ggplot(temp, aes(x = Continent, y = Prop_superfamily, fill = Superfamily)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  facet_grid(rows = vars(Order))

temp = TE_insertions %>%
  filter(Allele_Frequency >= threshold | is.na(Allele_Frequency))  %>%
  dplyr::count(Continent, ref_non_ref_TIP) %>%
  group_by(Continent, ref_non_ref_TIP) %>%
  mutate(Count_type = sum(n)) %>%
  group_by(Continent) %>%
  mutate(Count_Continent = sum(n)) %>%
  ungroup() %>%
  mutate(Prop_type  = n/Count_Continent) 
ggplot(temp, aes(x = Continent, y = Prop_type, fill = ref_non_ref_TIP)) +
  geom_bar(stat = "identity") +
  theme_bw() 

Let’s do the same with the mapping TE content method! In this case, I don’t know which reads belong to which exact copy of a TE, but I can look at the reads aligning on each consensus sequence. I will thus make a PCA based on the proportion of reads mapping on each TE.

reads_per_TE = read_delim(paste0(RIP_DIR, "Nb_reads_per_TE.txt"), delim = "\t",
                    col_names = c("ID_file", "TE", "Length",
                                  "# mapped read-segments",  "# unmapped read-segments")) %>%
  filter(TE != "*") %>%
  separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  dplyr::mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)")) %>%
  left_join(Zt_meta %>% dplyr::select(ID_file, Collection, Country, Continent)) %>%
  unite(Continent, Country, ID_file, col = "for_display", remove = F)

temp = reads_per_TE %>% group_by(ID_file) %>%
       dplyr::summarise(Reads_mapped_per_TE = sum(`# mapped read-segments`))

reads_per_TE = left_join(reads_per_TE, temp) %>%
  dplyr::mutate(Normalized_nb_reads_mapped = `# mapped read-segments` / Reads_mapped_per_TE)



TE_PCA_mat = reads_per_TE %>%
  dplyr::select(ID_file, Continent, Collection, for_display, TE, Normalized_nb_reads_mapped) %>%
  spread(key = TE, value = as.numeric(Normalized_nb_reads_mapped))

temp = as.matrix(TE_PCA_mat[,c(5:ncol(TE_PCA_mat))])[, which(apply(as.matrix(TE_PCA_mat[,c(5:ncol(TE_PCA_mat))]), 2, var) != 0)]

TE.pca = prcomp(temp, center = TRUE,scale. = TRUE)


temp = as.tibble(cbind(TE_PCA_mat, as.data.frame(TE.pca$x))) %>%
  dplyr::select(for_display, Continent, PC1, PC2, PC3, PC4)

p = ggpairs(temp, columns = c(3:6), ggplot2::aes(col=Continent, fill = Continent, alpha = 0.6),
            title = "PCA based on normalized reads mapping on each TE consensus",
            upper = list(continuous = "points", combo = "box_no_facet"))

for(i in 1:p$nrow) {
  for(j in 1:p$ncol){
    p[i,j] <- p[i,j] + theme_bw() + Color_Continent +Fill_Continent
  }
}

p

#And now all against all but not interactive
temp = as.tibble(cbind(TE_PCA_mat, as.data.frame(TE.pca$x))) %>%
  dplyr::select(for_display, Collection, PC1, PC2, PC3, PC4)

p = ggpairs(temp, columns = c(3:6), ggplot2::aes(col=Collection, fill = Collection, alpha = 0.6),
            title = "PCA based on normalized reads mapping on each TE consensus",
            upper = list(continuous = "points", combo = "box_no_facet"))

for(i in 1:p$nrow) {
  for(j in 1:p$ncol){
    p[i,j] <- p[i,j] + theme_bw()
  }
}

p


TE insertions per chromosomes

I want to gain some insights about where, in the genome, are these TIPs we have detected: are they on specific chromosomes? Is there a difference between core and accessory chromosomes?

I’ll consider the number of TIP and normalize it by length to take into account the widely different chromosome size.

chromosomes_lengths = readxl::read_excel(paste0(metadata_dir, "Chromosomes_cumulative_lengths.xlsx")) 

p1 = TE_insertions %>%
  dplyr::count(Order, CHR, ID_file) %>%
  inner_join(dplyr::select(chromosomes_lengths, CHR, Length) %>% mutate(CHR = as.character(CHR))) %>%
  ggplot(aes(x = reorder(as.character(CHR), as.numeric(CHR)), y = n/(Length/1000000))) + 
    geom_boxplot() + 
    theme_bw() + 
  theme(axis.title = element_text(size = 10))+
  facet_wrap(vars(Order), scales = "free") +
  labs(x = "", y = "Per-sample insertions per Mb")

p2 = TE_insertions %>%
  dplyr::count(CHR, ref_non_ref_TIP, Order) %>%
  inner_join(dplyr::select(chromosomes_lengths, CHR, Length) %>% mutate(CHR = as.character(CHR))) %>%
  ggplot(aes(x = reorder(as.character(CHR), as.numeric(CHR)), y = n/(Length/1000000), fill = ref_non_ref_TIP)) + 
    geom_bar(stat = "identity") + 
    theme_bw() +
  facet_wrap(vars(Order), scales = "free") +
  scale_fill_manual(values = c("#FFD399", "#cbf3f0")) + 
  theme(legend.position = "bottom", axis.title = element_text(size = 10)) +
  labs(x = "", y = "Number insertions per Mb", fill = "Insertion type")

cowplot::plot_grid(p1, p2, ncol = 1, rel_heights = c(1, 1.2), align = "hv")

It looks like the accessory chromosomes might be a bit higher than the core chromosome. I also want to get finer in the scale: are there places in the genome which tend to have more TIPs than others?

TE_insertions_per_window = TE_insertions %>%
  mutate(CHR = as.numeric(CHR),
         window = trunc(((position + end)/2)/10000)) %>%
  dplyr::select(TE_insertion, CHR, window) %>%
  distinct() %>%
  group_by(CHR, window) %>%
  dplyr::count() 

summar = ungroup(TE_insertions_per_window) %>% 
  summarise(median_TE = median(n),
            sd_TE = sd(n))

TE_insertions_per_window  = TE_insertions_per_window %>%
  #inner_join(summar) %>%
  mutate(outlier = ifelse(n >= summar$median_TE + 3*summar$sd_TE, "outlier", "non_outlier")) 

TE_insertions_per_window %>%
  ungroup() %>%
  mutate(Start = window*10000, End = Start + 10000) %>%
  dplyr::select(-window, TIP_count = n, TIP_category = outlier) %>%
  write_tsv(file = paste0(TE_RIP_dir, "Ztritici_global_March2021.good_samples.10kb_windows.TIP_counts.tab"))

filter(TE_insertions_per_window, as.numeric(CHR) < 8) %>%
  ggplot(aes(x = window, y = n, col = as.character(outlier))) +
    theme_bw() + 
    theme(legend.position = "none") +
    scale_color_manual(values = c("#f5cac3", "#f28482")) +
    geom_point(alpha = .8) +
    geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
               linetype="dashed", col = "grey80") +
    facet_grid(rows = vars(CHR), scales = "free_x") 

filter(TE_insertions_per_window, as.numeric(CHR) >= 8 & as.numeric(CHR) < 14) %>%
  ggplot(aes(x = window, y = n, col = outlier)) +
    theme_bw() + 
    theme(legend.position = "none") +
    scale_color_manual(values = c("#f5cac3", "#f28482")) +
    geom_point(alpha = .8) +
    geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
               linetype="dashed", col = "grey80") +
    facet_grid(rows = vars(CHR), scales = "free_x")

filter(TE_insertions_per_window, as.numeric(CHR) >= 14) %>%
  ggplot(aes(x = window, y = n, col = outlier)) +
    theme_bw() + 
    theme(legend.position = "none") +
    scale_color_manual(values = c("#f5cac3", "#f28482")) +
    geom_point(alpha = .8) +
    geom_hline(yintercept = summar$median_TE + 2*summar$sd_TE,
               linetype="dashed", col = "grey80") +
    facet_grid(rows = vars(CHR), scales = "free_x")

label1 = ungroup(TE_insertions_per_window) %>%
  filter(!is.na(CHR)) %>%
  dplyr::summarize(Average = mean(n)) %>% pull()
p1 = ungroup(TE_insertions_per_window) %>%
  filter(!is.na(CHR)) %>%
  group_by(CHR) %>%
  dplyr::summarize(Average = mean(n)) %>%
  dplyr::mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
  ggplot(aes(x = reorder(as.character(CHR), CHR), y = Average, fill = Chromosome_type)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = label1, linetype = "dashed", col = "grey40") + 
  theme_bw() + 
  scale_fill_manual(values = c("#2ec4b6","#cbf3f0")) +
  labs(title = paste0("GW TIP number average is ", round(label1, 2)), 
       x = "Chromosome", y = "Mean # TIPs per 10kb") +
  theme(legend.position = "none")


p2 = ungroup(TE_insertions_per_window) %>%
  dplyr::count(CHR, outlier) %>%
  group_by(CHR) %>%
  dplyr::mutate(Tot_window = sum(n)) %>%
  mutate(prop = 100*n/Tot_window) %>%
  mutate(Chromosome_type = ifelse(CHR < 14, "core", "accessory")) %>%
  filter(outlier == "outlier") %>%
  ggplot(aes(x = reorder(as.character(CHR), CHR), y = prop, fill = Chromosome_type)) +
  geom_bar(stat = "identity") +
  theme_bw() + 
  scale_fill_manual(values = c("#2ec4b6","#cbf3f0")) +
  labs(title = "Proportion of outlier windows", 
       x = "Chromosome", y = "% outlier windows") +
  theme(legend.position = "none") +
  geom_text(aes(y = 22, label = paste0(round(prop, 2), "%")), size = 3)

cowplot::plot_grid(p1 + coord_flip(), p2 + coord_flip())

ungroup(TE_insertions_per_window) %>%
  dplyr::count(CHR, outlier) %>%
  group_by(CHR) %>%
  dplyr::mutate(Tot_window = sum(n)) %>%
  mutate(prop = 100*n/Tot_window) %>%
  filter(outlier == "outlier")
## # A tibble: 19 x 5
## # Groups:   CHR [19]
##      CHR outlier     n Tot_window  prop
##    <dbl> <chr>   <int>      <int> <dbl>
##  1     1 outlier     4        592 0.676
##  2     2 outlier     3        369 0.813
##  3     3 outlier     7        330 2.12 
##  4     4 outlier     2        271 0.738
##  5     5 outlier     7        269 2.60 
##  6     6 outlier     6        249 2.41 
##  7     7 outlier     4        253 1.58 
##  8     8 outlier     4        226 1.77 
##  9     9 outlier     7        206 3.40 
## 10    10 outlier     5        161 3.11 
## 11    11 outlier     3        158 1.90 
## 12    13 outlier     4        115 3.48 
## 13    14 outlier     3         70 4.29 
## 14    15 outlier     5         62 8.06 
## 15    17 outlier     5         52 9.62 
## 16    18 outlier     1         51 1.96 
## 17    19 outlier     4         52 7.69 
## 18    20 outlier     1         45 2.22 
## 19    21 outlier     3         37 8.11
ungroup(TE_insertions_per_window) %>% dplyr::count(outlier) 
## # A tibble: 2 x 2
##   outlier         n
##   <chr>       <int>
## 1 non_outlier  3691
## 2 outlier        78

There is one window that is much higher than anything else on chromosome 12. This is strange, so I started looking more specifically at this partilar locus. Is the increase in TIPs due to only one type of TE? Is it even more localized than the 10kb-window I’ve looked at?

temp = TE_insertions %>% filter(CHR == 12) %>%
  mutate(window = trunc(((position + end)/2)/10000)) %>% filter(window == 52)

p1 = temp %>% 
  dplyr::select(Superfamily, TE_family, TE_insertion ) %>% 
  distinct() %>% 
  ggplot(aes(x = Superfamily, fill = TE_family)) + 
  geom_bar() + 
  scale_fill_manual(values = rep(c("#f6bd60", "#f6ca83", "#f7ede2", "#f5cac3", "#84a59d", "#73683b", "#d0d38f", "#cd5b72", "#f28482"), 10)) 
p1 + theme(legend.position = "none")

cowplot::plot_grid(get_legend(p1))

temp %>% 
  mutate(pos = trunc((position + end)/200)) %>%
  group_by(TE_insertion, TE_family, Superfamily, ref_non_ref_TIP, pos) %>% 
  ggplot(aes(x = pos*100, fill = Continent)) +
  geom_bar() + 
  theme_bw() +
  facet_grid(rows = vars(Superfamily), scales = "free_y") +
  Fill_Continent

bind_rows(read_delim(paste0(TE_RIP_dir, "Non-ref_all_strains.bed"), 
           col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand"), delim = "\t") %>%
             mutate(ref_non_ref_TIP = "non_ref"),
          read_delim(paste0(TE_RIP_dir, "Ref_all_strains.bed"), 
           col_names = c("ID_file", "chromosome", "position", "end", "info", "score", "strand"), delim = "\t") %>%
             mutate(ref_non_ref_TIP = "ref")) %>%
  separate(col = chromosome, into = c("X1", "CHR", "X2", "X3", "X4"), sep = "_") %>%
  separate(col = info, into = c("TE_family", "TSD", "Allele_Frequency", "3_support", "5_support", "ref_reads"), sep = "\\|") %>%
  dplyr::select(-c(X1, X2, X3, X4, score, strand)) %>%
  dplyr::select(CHR, position, end, TE_family) %>%
  distinct() %>%
  write_tsv(paste0(TE_RIP_dir, "Insertions.real.bed"), col_names = F)
  

gff = read_delim(gff_file, col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot"), 
                 delim = "\t", comment = "#")  %>%
  filter(mRNA == "mRNA") %>%
  separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
  mutate(Name = str_remove(Name, "Name=")) 
~/Documents/Software/bedtools2/bin/bedtools  sort -i ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3 | grep "mRNA" > ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.mRNA.gff3

~/Documents/Software/bedtools2/bin/bedtools  sort -i /Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions.real.bed > /Users/afeurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions.sorted.bed

 ~/Documents/Software/bedtools2/bin/bedtools closest -a ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions.sorted.bed -b ~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.mRNA.gff3 -d > ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Insertions_closest_genes.tab
 
distance_TE_genes = read_delim(paste0(TE_RIP_dir, "Insertions_closest_genes.tab"),
         col_names = c("CHR", "position", "end", "TE_family", "X5", "X1", "mRNA", 
                       "Start", "Stop", "X2", "X3", "X4", "Annot", "Distance"), delim = "\t") %>%
  dplyr::select(-c(X1, X2, X3, X4, X5, mRNA)) %>% 
  mutate(CHR = as.character(CHR)) %>%  
  mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
  unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) 
  
temp = distance_TE_genes %>%  
  full_join(TE_insertions)

TE_insertions %>%
  unite(Continent, ID_file, col = "for_display", remove = F) %>%
   dplyr::count(TE_insertion)
## # A tibble: 80,511 x 2
##    TE_insertion                              n
##    <chr>                                 <int>
##  1 DHH_Ada_consensus-2:1:1002400:1002400     1
##  2 DHH_Ada_consensus-2:1:1002500:1002500     1
##  3 DHH_Ada_consensus-2:1:1043800:1043800     1
##  4 DHH_Ada_consensus-2:1:1062900:1062900     2
##  5 DHH_Ada_consensus-2:1:1073200:1073200     1
##  6 DHH_Ada_consensus-2:1:1102700:1102700     2
##  7 DHH_Ada_consensus-2:1:1102800:1102800     1
##  8 DHH_Ada_consensus-2:1:112100:112100       1
##  9 DHH_Ada_consensus-2:1:1122500:1122500     6
## 10 DHH_Ada_consensus-2:1:1128600:1128600     1
## # … with 80,501 more rows
#Categories based on allelic frequencies
p1 = TE_insertions_counts %>%
  mutate(for_bar = case_when(n == 1 ~ "01", 
                             n == 2 ~ "02",
                             n <= 10 ~ "10",
                             n <= 100 ~ "100",
                             n >= 100 ~ "100 +")) %>%
  inner_join(distance_TE_genes) %>%
  ggplot(aes(x = for_bar, y = log(Distance))) + 
  geom_violin()  +
  geom_boxplot(width = .2) +
  theme_bw()

#Categories based on population specificity
temp  = insertions_per_pop %>%
  filter(Nb_population == 1) %>%
  dplyr::select(-c(NA_count, Nb_population, n)) %>%
  pivot_longer(cols = -c(TE_insertion, ref_non_ref_TIP), names_to = "Specificity_category", values_to = "Nb") %>%
  filter(!is.na(Nb)) %>%
  dplyr::select(-Nb)

p2 = insertions_per_pop %>%
  filter(Nb_population > 1) %>%
  dplyr::select(TE_insertion) %>%
  mutate(Specificity_category = "Non_specific") %>%
  bind_rows(temp) %>%
  inner_join(distance_TE_genes) %>%
  ggplot(aes(x = Specificity_category, y = log(Distance))) + 
  geom_violin() +
  geom_boxplot(width = .2) + 
  theme_bw()


cowplot::plot_grid(p1, p2)

#
temp = inner_join(read_delim(paste0(TE_RIP_dir, "Insertions_closest_genes.downstream.tab"),
         col_names = c("CHR", "position", "end", "TE_family", "X5", "X1", "mRNA", 
                       "Start", "Stop", "X2", "X3", "X4", "Annot", "Distance"), delim = "\t") %>% 
  mutate(CHR = as.character(CHR)) %>%  
  mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
  unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) %>%
  dplyr::select(TE_insertion, Distance_downstream = Distance),
  read_delim(paste0(TE_RIP_dir, "Insertions_closest_genes.upstream.tab"),
         col_names = c("CHR", "position", "end", "TE_family", "X5", "X1", "mRNA", 
                       "Start", "Stop", "X2", "X3rm", "X4", "Annot", "Distance"), delim = "\t") %>% 
  mutate(CHR = as.character(CHR)) %>%  
  mutate(position = 100*trunc(position/100), end = 100*trunc(end/100)) %>%
  unite(TE_family, CHR, position, end, col = "TE_insertion", sep = ":", remove = F) %>%
  dplyr::select(TE_insertion, Distance_upstream = Distance))

ggplot(temp, aes(x = abs(Distance_downstream), y = abs(Distance_upstream))) +
  geom_hex() + 
  theme_bw() + 
  scale_x_continuous(trans = "log10")+ 
  scale_y_continuous(trans = "log10")

temp = inner_join(temp, TE_insertions_counts)
p1 = temp %>% filter(ref_non_ref_TIP == "ref") %>%
ggplot(aes(x = abs(Distance_downstream), y = abs(Distance_upstream))) +
  geom_hex() + 
  theme_bw() + 
  scale_x_continuous(trans = "log10") + 
  scale_y_continuous(trans = "log10") +
  labs(title = "Distance to gene of reference TIP") +
  theme(legend.position = "bottom") 

p2 = temp %>% filter(ref_non_ref_TIP != "ref") %>%
ggplot(aes(x = abs(Distance_downstream), y = abs(Distance_upstream))) +
  geom_hex() + 
  theme_bw() + 
  scale_x_continuous(trans = "log10") + 
  scale_y_continuous(trans = "log10") +
  labs(title = "Distance to gene of non-reference TIP")+
  theme(legend.position = "bottom")

plot_grid(p1, p2)





Repeat-Induced Point mutation

I now look at the repeat-induced point mutations in reads that map on the different TE consensus. We expect to see differences in the different geographical groups so I start by visualizing this.

#while read p; do fichier_list=$(ls -1 /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/${p}\.*txt) ; for fichier in $fichier_list; do short_name=$(echo $fichier | cut -f 8 -d "/" | cut -f 2 -d "." ) ; comp=$(grep Composite $fichier) ; echo $p $short_name $comp ;  done; done < Keep_lists_samples/Ztritici_global_March2021.genotyped.good_samples.args > /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt 

RIP=read_delim(paste0(RIP_DIR, "Composite_index.txt"),
             col_names = c("ID_file", "TE",  "Variable", "Composite_median", "Composite_mean" ), 
             delim = " ", na = c("nan", "NA", "")) %>%
  separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F, extra = "merge") %>%
  mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)"))%>%
  left_join(Zt_meta %>% dplyr::select(Continent, Country, Collection, ID_file)) %>%
  unite(Continent, Country, ID_file, col = "for_display", remove = F) %>%
  left_join(., reads_per_TE)
#DONE: merge with reads_per_TE to be able to filter the points that have too few reads mapped!

#Per continent

world_RIP_avg <-
  RIP %>%
  filter(TE == "RIP_est") %>%
  dplyr::summarize(avg = mean(as.numeric(Composite_median), na.rm = T)) %>%
  pull(avg)

temp = RIP %>%
  filter(TE == "RIP_est") %>%
  filter(!is.na(Continent)) %>%
  group_by(Continent) %>%
  dplyr::mutate(region_avg = mean(as.numeric(Composite_median), na.rm = T))

RIP_plot = ggplot(temp, aes(x = Continent,
                            y = as.numeric(Composite_median),
                            color = Continent))   +
  coord_flip() +
  geom_segment(aes(x = Continent, xend = Continent,
        y = world_RIP_avg, yend = region_avg), size = 0.8) +
  geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
  geom_hline(aes(yintercept = world_RIP_avg), color = "gray70", size = 0.6) +
  stat_summary(fun = mean, geom = "point", size = 5) +
  Color_Continent +
  theme_cowplot() +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = "RIP composite index",
        title = "RIP levels per continents",
        subtitle = str_wrap(paste("The RIP levels in reads mapping on TE consensus",
                                  "are high in the Middle-East",
                                  "and low in North America in particular."), width = 70))


#Statistical test
#One-way ANOVA with blocks
##Define linear model
model = lm(as.numeric(Composite_median) ~ Continent + Collection ,
          data=temp)
summary(model)   ### Will show overall p-value and r-squared
## 
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent + Collection, 
##     data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45164 -0.04214  0.00525  0.04319  0.50785 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.76372    0.03086  57.144  < 2e-16 ***
## ContinentAsia                    -0.13501    0.10119  -1.334   0.1824    
## ContinentEurope                  -0.22017    0.02011 -10.948  < 2e-16 ***
## ContinentMiddle East              0.13058    0.02027   6.443 1.77e-10 ***
## ContinentNorth America           -0.44765    0.01948 -22.974  < 2e-16 ***
## ContinentOceania                 -0.40964    0.02634 -15.553  < 2e-16 ***
## ContinentSouth America           -0.31207    0.02138 -14.598  < 2e-16 ***
## CollectionEschikon_2017          -0.15579    0.02544  -6.123 1.29e-09 ***
## CollectionETHZ_2020              -0.04931    0.02794  -1.765   0.0779 .  
## CollectionFillinger              -0.15375    0.07216  -2.131   0.0334 *  
## CollectionGWASpanel_BIOGER       -0.13263    0.02553  -5.195 2.45e-07 ***
## CollectionHartmann_FstQst_2015   -0.36444    0.02693 -13.532  < 2e-16 ***
## CollectionHartmann_Oregon_2016   -0.16815    0.02937  -5.725 1.34e-08 ***
## CollectionJGI                     0.02059    0.03034   0.678   0.4976    
## CollectionJGI_2                  -0.01836    0.03006  -0.611   0.5414    
## CollectionJGI_3                  -0.16092    0.03328  -4.835 1.53e-06 ***
## CollectionJGI_4                  -0.10925    0.09933  -1.100   0.2717    
## CollectionJGI_Thierry            -0.06220    0.02896  -2.148   0.0319 *  
## CollectionMegan_McDonald         -0.07177    0.04165  -1.723   0.0851 .  
## CollectionMMcDonald_2018         -0.02227    0.03345  -0.666   0.5058    
## CollectionSyngenta               -0.23710    0.02411  -9.836  < 2e-16 ***
## CollectionThird_batch_2018_BM_TM -0.05951    0.03457  -1.721   0.0855 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09653 on 1067 degrees of freedom
##   (16 observations deleted due to missingness)
## Multiple R-squared:  0.7924, Adjusted R-squared:  0.7883 
## F-statistic:   194 on 21 and 1067 DF,  p-value: < 2.2e-16
##Conduct analysis of variance
Anova(model,type = "II")  
## Anova Table (Type II tests)
## 
## Response: as.numeric(Composite_median)
##             Sum Sq   Df F value    Pr(>F)    
## Continent  17.3013    6 309.435 < 2.2e-16 ***
## Collection 11.2446   15  80.445 < 2.2e-16 ***
## Residuals   9.9431 1067                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model)
## 
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent + Collection, 
##     data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45164 -0.04214  0.00525  0.04319  0.50785 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.76372    0.03086  57.144  < 2e-16 ***
## ContinentAsia                    -0.13501    0.10119  -1.334   0.1824    
## ContinentEurope                  -0.22017    0.02011 -10.948  < 2e-16 ***
## ContinentMiddle East              0.13058    0.02027   6.443 1.77e-10 ***
## ContinentNorth America           -0.44765    0.01948 -22.974  < 2e-16 ***
## ContinentOceania                 -0.40964    0.02634 -15.553  < 2e-16 ***
## ContinentSouth America           -0.31207    0.02138 -14.598  < 2e-16 ***
## CollectionEschikon_2017          -0.15579    0.02544  -6.123 1.29e-09 ***
## CollectionETHZ_2020              -0.04931    0.02794  -1.765   0.0779 .  
## CollectionFillinger              -0.15375    0.07216  -2.131   0.0334 *  
## CollectionGWASpanel_BIOGER       -0.13263    0.02553  -5.195 2.45e-07 ***
## CollectionHartmann_FstQst_2015   -0.36444    0.02693 -13.532  < 2e-16 ***
## CollectionHartmann_Oregon_2016   -0.16815    0.02937  -5.725 1.34e-08 ***
## CollectionJGI                     0.02059    0.03034   0.678   0.4976    
## CollectionJGI_2                  -0.01836    0.03006  -0.611   0.5414    
## CollectionJGI_3                  -0.16092    0.03328  -4.835 1.53e-06 ***
## CollectionJGI_4                  -0.10925    0.09933  -1.100   0.2717    
## CollectionJGI_Thierry            -0.06220    0.02896  -2.148   0.0319 *  
## CollectionMegan_McDonald         -0.07177    0.04165  -1.723   0.0851 .  
## CollectionMMcDonald_2018         -0.02227    0.03345  -0.666   0.5058    
## CollectionSyngenta               -0.23710    0.02411  -9.836  < 2e-16 ***
## CollectionThird_batch_2018_BM_TM -0.05951    0.03457  -1.721   0.0855 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09653 on 1067 degrees of freedom
##   (16 observations deleted due to missingness)
## Multiple R-squared:  0.7924, Adjusted R-squared:  0.7883 
## F-statistic:   194 on 21 and 1067 DF,  p-value: < 2.2e-16
hist(residuals(model), col="darkgray")

#Post-hoc analysis:  mean separation tests
marginal = lsmeans(model, ~ Continent)

pairs(marginal, adjust="sidak")
##  contrast                      estimate     SE   df t.ratio p.value
##  Africa - Asia                   0.1350 0.1012 1067   1.334  0.9854
##  Africa - Europe                 0.2202 0.0201 1067  10.948  <.0001
##  Africa - Middle East           -0.1306 0.0203 1067  -6.443  <.0001
##  Africa - North America          0.4476 0.0195 1067  22.974  <.0001
##  Africa - Oceania                0.4096 0.0263 1067  15.553  <.0001
##  Africa - South America          0.3121 0.0214 1067  14.598  <.0001
##  Asia - Europe                   0.0852 0.0998 1067   0.853  1.0000
##  Asia - Middle East             -0.2656 0.1006 1067  -2.639  0.1630
##  Asia - North America            0.3126 0.1004 1067   3.113  0.0391
##  Asia - Oceania                  0.2746 0.1018 1067   2.697  0.1390
##  Asia - South America            0.1771 0.1007 1067   1.758  0.8223
##  Europe - Middle East           -0.3508 0.0158 1067 -22.182  <.0001
##  Europe - North America          0.2275 0.0147 1067  15.499  <.0001
##  Europe - Oceania                0.1895 0.0217 1067   8.744  <.0001
##  Europe - South America          0.0919 0.0188 1067   4.894  <.0001
##  Middle East - North America     0.5782 0.0146 1067  39.586  <.0001
##  Middle East - Oceania           0.5402 0.0221 1067  24.417  <.0001
##  Middle East - South America     0.4427 0.0183 1067  24.160  <.0001
##  North America - Oceania        -0.0380 0.0207 1067  -1.839  0.7624
##  North America - South America  -0.1356 0.0174 1067  -7.781  <.0001
##  Oceania - South America        -0.0976 0.0251 1067  -3.886  0.0023
## 
## Results are averaged over the levels of: Collection 
## Note: contrasts are still on the as.numeric scale 
## P value adjustment: sidak method for 21 tests
CLD_RIP = cld(marginal,
          alpha   = 0.05,
          Letters = letters,  ### Use lower-case letters for .group
          adjust  = "sidak")  ### sidak-adjusted p-values

CLD_RIP
##  Continent     lsmean     SE   df lower.CL upper.CL .group 
##  North America   1.21 0.0132 1067     1.17     1.24  a     
##  Oceania         1.25 0.0191 1067     1.19     1.30  ab    
##  South America   1.34 0.0180 1067     1.29     1.39    c   
##  Europe          1.43 0.0104 1067     1.41     1.46     d  
##  Asia            1.52 0.0997 1067     1.25     1.79   bcdef
##  Africa          1.65 0.0195 1067     1.60     1.71      e 
##  Middle East     1.79 0.0149 1067     1.75     1.83       f
## 
## Results are averaged over the levels of: Collection 
## Results are given on the as.numeric (not the response) scale. 
## Confidence level used: 0.95 
## Conf-level adjustment: sidak method for 7 estimates 
## Note: contrasts are still on the as.numeric scale 
## P value adjustment: sidak method for 21 tests 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.
CLD_RIP$.group=gsub(" ", "", CLD_RIP$.group)

text_y = (max(as.numeric(temp$Composite_median), na.rm = T) + 0.1*max(as.numeric(temp$Composite_median), na.rm = T))

RIP_plot +
  geom_text(data = CLD_RIP, aes(x = Continent,
                            y = text_y,
                            label = .group), color = "black")

It is known that different TE groups, in particular the MITEs, which are particularly small are less RIPped than other types of TEs. I wanted to check whether we saw such a pattern and so I visualize here the RIP per superfamily of TEs and then as related to the size of the consensus.

#Per  TE superfamily
RIP  %>%
  filter(Normalized_nb_reads_mapped > 0.0001) %>%
  group_by(Superfamily)%>%
  mutate(median_Superfamily=median(Composite_median, na.rm = T) )%>%
  ggplot(aes(x = Superfamily,
             y = as.numeric(Composite_median),
             fill = median_Superfamily)) +
    geom_boxplot(outlier.shape = NA) +
    theme_bw() +
    ylab("Median of composite index on TE reads") +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy")

temp = RIP  %>%
  filter(Normalized_nb_reads_mapped > 0.0001) %>%
  group_by(Superfamily, Continent, Order)%>%
  dplyr::summarize(median_Superfamily=median(Composite_median, na.rm = T) )

temp %>%
  filter(Order == "Class I (retrotransposons)") %>%
  ggplot(aes(x = Superfamily,
             y = median_Superfamily,
             fill = Continent)) +
    geom_bar(stat = "identity", position=position_dodge()) +
  Fill_Continent +    
  theme_bw() +
    ylab("Median of composite index on TE reads") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))

temp %>%
  filter(Order == "Class II (DNA transposons)") %>%
  ggplot(aes(x = Superfamily,
             y = median_Superfamily,
             fill = Continent)) +
    geom_bar(stat = "identity", position=position_dodge()) +
  Fill_Continent +    
  theme_bw() +
    ylab("Median of composite index on TE reads") +
    theme(axis.text.x = element_text(angle = 40, hjust = 1))

#As relating to TE size
#NB: in the following plot, the alpha parameter is set to make the TE without any reads (or near so) invisible
#This means that not all consensus are visible. In particular, some, annotated  by Ursula as "verylowcopy" are not.
TE_consensus_faidx =read_delim(paste0(TE_RIP_dir, "Badet_BMC_Biology_2020_TE_consensus_sequences.fasta.fai"),
             col_names = c("TE",  "length",  "offset",  
             "number of bases per line",  "number of bytes per line"), delim = "\t")

p = inner_join(RIP, TE_consensus_faidx) %>%
  dplyr::group_by(Order, TE, length) %>%
  filter (TE != "RIP_est") %>%
  filter(!is.na(Normalized_nb_reads_mapped)) %>%
  dplyr::summarize(median_per_consensus=median(Composite_median, na.rm = T),
                   read_mapped = median(Normalized_nb_reads_mapped), na.rm = T) %>%
  ggplot(aes(x = log(length),
             y = median_per_consensus,
             color = Order)) +
  geom_point(aes( text = TE,
             alpha = log(read_mapped)))  + theme_bw() +
  ylim(c(-2, 4)) + geom_hline(yintercept = 0) +
  labs(x = "TE length (log-scale)",
       y = "Median of RIP composite index",
       title = str_wrap(paste("Median of the RIP composite index for each TE consensus",
                        "against the length of the consensus sequence"), width = 60)) # + geom_smooth(span = 1.5, fill = NA, size =0.7)

ggplotly(p)
p = inner_join(RIP, TE_consensus_faidx) %>%
  filter (TE != "RIP_est") %>%
  dplyr::group_by(Order, TE, length) %>%
  dplyr::summarize(median_per_consensus=median(Composite_median, na.rm = T),
                   read_mapped = median(Normalized_nb_reads_mapped), na.rm = T) %>%
  filter(str_detect(TE, pattern = "SINE") | str_detect(TE, pattern = "MITE") ) %>%
  ggplot(aes(x = log(length),
             y = median_per_consensus,
             color = Order, text = TE,
             alpha = log2(read_mapped))) +
  geom_point()  + theme_bw() +
  ylim(c(-2, 4)) + geom_hline(yintercept = 0)

ggplotly(p)

Finally, I look at the relation between the amount of reads mapping on TE consensus and the level of RIP detected. I also investigate several possible bias.

TE_RIP = inner_join(TE_qty, RIP %>% filter(TE == "RIP_est"))

TE_RIP$Sampling_Date[is.na(TE_RIP$Sampling_Date)] <- "Unknown"

temp = TE_RIP %>%
  group_by(Continent) %>%
  dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
                  Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
  dplyr::mutate(for_display = Continent)

#TE and RIP together
t = ggplot(TE_RIP, aes(as.numeric(Percent_TE_Reads),
                          as.numeric(Composite_median),
                          color = Continent,
                          text = for_display))+
  theme_cowplot()  +
  geom_point(alpha = 0.6) + Color_Continent +
  labs(color = "Geographical group",
       x = "Percentage of TE reads", y = "RIP composite median",
       title = "Amount of transposable elements vs RIP level") +
  geom_point(data = temp, aes(as.numeric(TE_qty),
                                as.numeric(Composite_median),
                                color = Continent), size = 5)
#ggplotly(t)
t

TE_RIP %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
ggplot(aes(as.numeric(Total_insertions),
                          as.numeric(Composite_median),
                          color = Continent,
                          text = for_display))+
  theme_cowplot()  +
  geom_point(alpha = 0.6) + Color_Continent +
  labs(color = "Geographical group",
       x = "TE insertion number", y = "RIP composite median",
       title = "Amount of transposable elements vs RIP level") 

bias1 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color = Collection, text = for_display)) +
  theme_cowplot() +
  geom_point(alpha = 0.8)

#bias2 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color =Library_strategy, text = for_display)) +
#  theme_cowplot() +
#  geom_point(alpha = 0.8)

ggplotly(bias1)
cowplot::plot_grid(RIP_plot +
                     labs (title = "", subtitle = ""), #+
                     #geom_text(data = CLD_RIP, aes(x = Continent,
                      #                         y = text_y,
                      #                         label = .group),
                      #         color = "black"),
                   TE_prop +
                     labs (title = "", subtitle = "") +
                     #geom_text(data = CLD, aes(x = Continent,
                      #                         label = .group,
                      #                         y = 34),
                      #         color   = "black") +
                     theme(legend.position = "none") +
                     coord_flip())

subset = TE_RIP %>%
  filter(Collection != "Hartmann_FstQst_2015")

temp = subset %>%
  group_by(Continent) %>%
  dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
                  Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
  dplyr::mutate(for_display = Continent)

t = ggplot(subset, aes(as.numeric(Percent_TE_Reads),
             as.numeric(Composite_median),
             color = Continent,
             text = for_display)) +
  theme_cowplot()  +
  geom_point(alpha = 0.6) + Color_Continent +
  labs(color = "Geographical group",
       x = "Percentage of TE reads", y = "RIP composite median",
       title = "Amount of transposable elements vs RIP level") +
  geom_point(data = temp, aes(as.numeric(TE_qty),
                                as.numeric(Composite_median),
                                color = Continent), size = 5)
ggplotly(t)

The RIP index does seem consistent so far with what Cécile has found, with higher RIP in the Middle-East and African populations and lower in the rest (in particular North America and Oceania).

RIP along the chromosomes

RIP_in_high_copies_TE = read_delim(paste0(TE_RIP_dir, "All_genomes.all_TEs.GC.RIP.tab"),
             col_names = c("Sample", "Seq_ID", "GC", "Product_index", "Substrate_index", "Composite_index"),
             delim = "\t")

temp = RIP_in_high_copies_TE %>%
  separate(Seq_ID, into = c("Sample", "Chrom", "Start", "End"),
           sep = "\\.|-|:", remove = F) %>%
  dplyr::mutate(Start = as.integer(Start), End = as.integer(End)) %>%
  dplyr::mutate(Coord = (Start + End)/2)

#temp %>%
#  filter(Chrom == "chr_4") %>%
#  filter(Composite_index <= 2 & Composite_index >= - 2 )%>%
#  ggplot(aes(Coord, Composite_index, col = Composite_index)) +
#    geom_point() +
#    facet_wrap (vars(Sample), ncol = 1) +
#  theme_bw() +
#  scale_color_viridis_c()

temp %>%
  dplyr::mutate(Nb = str_pad(trunc((Coord / 100000), 0), 6, pad = "0")) %>%
  filter(Composite_index >= -2 & Composite_index<= 2) %>%
  #filter(Chrom == "chr_1") %>%
  unite(Window, Chrom, Nb) %>%
  group_by(Sample, Window) %>%
  dplyr::summarize(Composite = mean(Composite_index, na.rm = T)) %>%
  ggplot(aes(Window, Sample, fill = Composite)) +
    geom_tile() +
  theme_bw() +
  scale_fill_viridis_c()

temp %>%
  filter(Composite_index >= -3 & Composite_index<= 3) %>%
  mutate(Chr = as.integer(str_replace(Chrom, "chr_", ""))) %>%
  group_by(Sample, Chr) %>%
  dplyr::summarize(Composite = mean(Composite_index, na.rm = T)) %>%
  ggplot(aes(Chr, Sample, fill = Composite)) +
    geom_tile() +
  theme_bw() +
  scale_fill_viridis_c()

Is dim2 present in its intact version in some populations?

Next step will be to check if dim2 is present where the RIP values suggest they are: intact copies in the Middle-East and Africa and absence/degeneration in the rest. Here, I use de novo genome assemblies and try to identify copies of dim2. For this, I use a deRIPped sequence of dim2 in the reference, blasted it on to Zt10 to get the sequence of Zt10_dim2 since it is known to have an intact sequence (see Mareike’s paper). I then compare this sequence (blastn) with de de novo assemblies to pull all the copies and identify the highest identity score.

#Here the Zt10_dim2_from_MgDNMT_deRIP.fa is the sequence of Zt10_6.417 which corresponds to the deRIPPed version of dim2 in the reference IPO323 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2940312/pdf/GEN186167.pdf


#sbatch -p normal.168h --array=1-1195%50 Detect_gene_blast_array.sh /home/alice/WW_PopGen/Directories_new.sh /data2/alice/WW_project/0_Data/Zt10_dim2_from_MgDNMT_deRIP.fa /home/alice/WW_PopGen/Keep_lists_samples/Good_assemblies.args Illumina /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/

Let’s look at the results as dot plots and compare the results from the dim2 blast to the RIP composite index. So far the Middle-Eastern samples seem quite similar to one another, whereas other regions contain way more variability such as Europe.

In order to identify the native dim2 copy in genomes, I look for several things:

  • the blast match has to be on the same contig as at least one of the two known flanking genes
  • the blast match has to be between the two flanking genes if both are on the same contig or less than 10 kb from the known flanking gene on the same contig
#system(paste0("cat ", DIM2_DIR, "*.blast.tab > ", DIM2_DIR, "Overall_results_blast_dim2.txt"))

length_dim2 = 3846
length_flank1 = 3689
length_flank2 = 1222
threshold_length_dim2 = 0.8 * length_dim2
threshold_length_flank1 = 0.8 * length_flank1
threshold_length_flank2 = 0.8 * length_flank2

dim2_blast_results_complete = read_delim(paste0(DIM2_DIR, "Overall_results_blast_dim2.txt"), delim = " ",
                                col_names = c("sample", "gene", "qseqid", "sseqid", "pident", "length",
                                              "mismatch", "gapopen", "qstart", "qend",
                                              "sstart", "send", "evalue", "bitscore"))

flank1 = dim2_blast_results_complete %>%
  filter(gene == "dim2_flank1_Zt10_unitig_006_0418") %>%
  dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
  dplyr::mutate(midcoord_flank1 = (sstart + send)/2) %>%
  filter(length > threshold_length_flank1) %>%
  pivot_wider(names_from = gene, values_from = sseqid) %>%
  dplyr::select(-length, -sstart, -send) %>%
  group_by(sample) %>%
  dplyr::mutate(nb_gene = n(), 
                dim2_flank1 = dim2_flank1_Zt10_unitig_006_0418 )
  

flank2 = dim2_blast_results_complete %>%
  filter(gene == "dim2_flank2_Zt10_unitig_006_0416") %>%
  dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
  dplyr::mutate(midcoord_flank2 = (sstart + send)/2) %>%
  filter(length > threshold_length_flank2) %>%
  pivot_wider(names_from = gene, values_from = sseqid) %>%
  dplyr::select(-length, -sstart, -send) %>%
  group_by(sample) %>%
  dplyr::mutate(nb_gene = n(), 
                dim2_flank2 = dim2_flank2_Zt10_unitig_006_0416)

#First let's extract some information such as finding the middle coordinates for the
# 3 genes of interest
dim2_blast_results = dim2_blast_results_complete %>%
  dplyr::filter(gene == "Zt10_dim2_from_MgDNMT_deRIP") %>%
  full_join(., flank1, by = "sample") %>%
  full_join(., flank2, by = "sample") %>%
  dplyr::mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
                dim2_flank2 = replace_na(dim2_flank2, "Not_found"),
                midcoord_flank1 = replace_na(midcoord_flank1, "0"),
                midcoord_flank2 = replace_na(midcoord_flank2, "0")) %>%
  dplyr::mutate(midcoord_flank1 = as.numeric(midcoord_flank1),
                midcoord_flank2 = as.numeric(midcoord_flank2))  %>%
  dplyr::mutate(ave_coord = (sstart + send)/2)  %>%
  dplyr::mutate(min_fl = ifelse(midcoord_flank1 > midcoord_flank2,
                                midcoord_flank2, midcoord_flank1))  %>%
  dplyr::mutate(max_fl = ifelse(midcoord_flank1 > midcoord_flank2,
                                midcoord_flank1, midcoord_flank2))

#Now, let's compare the blast results of dim2 with the flanking genes
# (contig name and distance)
dim2_blast_results = dim2_blast_results %>%
  dplyr::mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
                                 "Both",
                                 ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
                                 "None",
                                 ifelse(sseqid == dim2_flank1, "Flank1",
                                 ifelse(sseqid == dim2_flank2, "Flank2", "What"))))) %>%
  dplyr::mutate(distance = ifelse(is_same_contig == "None", "No_distance",
                           ifelse(is_same_contig == "Both",
                                  ifelse(ave_coord >= min_fl &
                                         ave_coord <= max_fl ,
                                  "Distance_OK", "Not_between"),
                                  ifelse(is_same_contig == "Flank1",
                                         ifelse((abs(midcoord_flank1 - ave_coord) <= 10000),
                                  "Distance_OK", "Too_far"),
                                  ifelse((abs(midcoord_flank2 - ave_coord) <= 10000),
                                  "Distance_OK", "Too_far"))))) %>%
  mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0",
                                         ifelse(distance == "Distance_OK", ">1", "0")))



  #dplyr::select(sample, sseqid, length, dim2_flank1, dim2_flank2, is_same_contig) %>%



dim2_blast_results %>%
  ggplot(aes(length)) +
    geom_density(alpha = 0.7) +
    theme_bw() +
    geom_vline(aes(xintercept = threshold_length_dim2), color = "gray70", size = 0.6) +
    labs(x = "Length (bp)",
         y = "Density",
         title = "Length of all blast matches against Zt10dim2",
         subtitle = str_wrap(paste("There is a large range in the size of the matches.",
                                   " In order to ignore very short matches, I could set up a threshold",
                                   " visualized here as the grey line."),
         width = 70))

However, the length also greatly varies for copies for which we were able to detect one or two of the flanking genes on the same contig.

p1 = dim2_blast_results %>%
  ggplot(aes(length, fill = is_same_contig, color = is_same_contig)) +
    geom_density(alpha = 0.7) +
    theme_bw() +
    labs(x = "Length (bp)",
         y = "Density",
         title = str_wrap("Length of all blast matches against Zt10dim2", width = 30))


p3 = dim2_blast_results %>%
  ggplot(aes(Nb_flanking_found_2cat, fill = is_same_contig)) +
    geom_bar(alpha = 0.7) +
    theme_bw() +
  labs ( x = "Number of flanking genes on the same contig",
         y = "Number of copies",
         title = str_wrap("Numbers of copies found in the same contig as the flanking genes ", width = 30)) +
  theme(legend.position = "none")

legend_b <- get_legend(
  p1 +
    guides(color = guide_legend(nrow = 1)) +
    theme(legend.position = "bottom")
)

prow <- plot_grid(
  p1 + theme(legend.position="none"),
  p3 + theme(legend.position="none"),
  align = 'vh',
  labels = c("A", "B"),
  hjust = -1,
  nrow = 1)

plot_grid(prow, legend_b, ncol = 1, rel_heights = c(1, .1))

Now, I will select only the copies which have a least one flanking gene found on the same contig. I consider these the “native” copy of the gene. And I will then make some plots using only these original copies to investigate the geographical distribution of both length and identity with the dim2 copy of Zt10.

temp = dim2_blast_results %>%
  group_by(sample) %>%
  dplyr::summarize(n_matches = n(),
                   n_long_matches = sum(length > 1000))

sum_dim2_blast = dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  group_by(sample) %>%
  dplyr::summarize(length_sum = sum(length),
                   pident_wm = weighted.mean(pident, length),
                   n_matches_on_native_contigs = n()) %>%
  filter(length_sum < length_dim2 + 200) %>%
  inner_join(., temp) %>%
  inner_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file"))



#Plots of identity vs length of the original copy
scatter <- sum_dim2_blast %>%
  ggplot(aes(length_sum, pident_wm,
             color = Continent,
             shape = as.character(n_matches_on_native_contigs))) +
  geom_point() +
  theme_bw() + Color_Continent +
  theme(legend.position = "none") +
  labs( x = "Length of the recovered native copy",
        y = "Identity with Zt10dim2")

hist_top = sum_dim2_blast %>%
  filter(Continent != "Asia") %>%
  ggplot(aes(Continent, length_sum,
             fill = Continent, color = Continent)) +
  geom_boxplot(alpha = 0.6, width = 0.4) +
  theme_bw() +
  Fill_Continent + Color_Continent+
  theme(legend.position = "none",
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7)) +
  coord_flip() +
  labs( x = "", y = "")

hist_right = sum_dim2_blast %>%
  filter(Continent != "Asia") %>%
  ggplot(aes(Continent, pident_wm, fill = Continent)) +
  geom_violin(alpha = 0.6) +
  theme_bw() + Fill_Continent +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
  labs( x = "", y = "")


#Options for top right plot
legend_b <- get_legend(
  hist_top +
    guides(color = guide_legend(nrow = 1)) +
    theme(legend.position = "right")
)

empty <- ggplot()+geom_point(aes(1,1), colour="white")+
         theme(axis.ticks=element_blank(),
               panel.background=element_blank(),
               axis.text.x=element_blank(), axis.text.y=element_blank(),           
               axis.title.x=element_blank(), axis.title.y=element_blank())




#Gather all
cowplot::plot_grid(hist_top, legend_b, scatter, hist_right,
                   ncol=2, nrow=2,
                   rel_widths=c(1.2, 1), rel_heights=c(1, 1),  
                   align = 'vh')

sum_dim2_blast %>%
  filter(Continent != "Asia") %>%
  dplyr::mutate(Nb_frag = as.character(n_matches_on_native_contigs))  %>%
  dplyr::count(Nb_frag, Collection, Continent) %>%
  ggplot(aes(Collection, Continent, color = Collection)) +
  geom_point(aes(size = n), stat = "identity", alpha = 0.6) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
  labs( x = "", y = "") +
  Fill_Continent +
  facet_wrap(vars(Nb_frag))

Here a large proportion of the Middle-Eastern samples from the old Hartmann dataset have two matches on the “right” contigs: this is due to a deletion found in one of the allele of dim2 (shown in Mareike’s new version of her dim2 manuscript).

Let’s investigate quickly how many long copies there are in each genome. I’m also interested in the native match as related to RIP and geography.

p1 = sum_dim2_blast %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent,
                      shape = Collection))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Identity of the native match",
                      width = 20))

p2 = sum_dim2_blast %>%
  ggplot(aes(as.numeric(Composite_median), n_long_matches, color = Continent,
                      shape = Collection))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Number of long blast matches (above 1kb)",
                      width = 20))

p3 = sum_dim2_blast %>%
  ggplot(aes(x = pident_wm, y = n_long_matches, color = Continent,
                      shape = Collection)) +
  geom_point() + Color_Continent + theme_bw()+
    labs(x = str_wrap("Identity of the native match",
                      width = 20),
         y = str_wrap("Number of long blast matchess (above 1kb)",
                      width = 20),
         color = "Geographical group") +
  theme(axis.title=element_text(size=10))

bottom_row <- cowplot::plot_grid(p1, p2, labels = c('B', 'C'), label_size = 12)


cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$Composite_median, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  sum_dim2_blast$pident_wm and sum_dim2_blast$Composite_median
## S = 89748704, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2710887
cor.test(sum_dim2_blast$n_long_matches, sum_dim2_blast$Composite_median, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  sum_dim2_blast$n_long_matches and sum_dim2_blast$Composite_median
## S = 129899429, p-value = 0.09839
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.05500309
cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$n_long_matches, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  sum_dim2_blast$pident_wm and sum_dim2_blast$n_long_matches
## S = 136729196, p-value = 0.0008775
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1104724
cowplot::plot_grid(p3, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)

p1 = sum_dim2_blast %>%
  filter(Collection == "Hartmann_FstQst_2015") %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Identity of the native match",
                      width = 20))

p2 = sum_dim2_blast %>%
  filter(Collection != "Hartmann_FstQst_2015") %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Identity of the native match",
                      width = 20))

cowplot::plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12)

And then as boxplots per continental region.

p1 = sum_dim2_blast %>%
  group_by(Continent) %>%
  dplyr::mutate(avg_per_cont = mean(pident_wm, na.rm = T)) %>%
  ggplot(aes(Continent, pident_wm,
             fill = Continent)) +
  geom_violin(aes(fill = Continent), alpha = 0.5) +
    geom_jitter(aes(col = Continent), size = 0.8, alpha = 0.8, width = 0.2, height = 0.1) +
  Fill_Continent + Color_Continent  +
    theme_light() +
    theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    axis.text.x = element_text(size = 10, angle = 40, hjust = 1)) +
   coord_flip() +
  labs(x = NULL,
       y = "Identity of the native copy to Zt10dim2")


p2 = ggplot(sum_dim2_blast, aes(Continent, n_long_matches,
             fill = Continent)) +
  geom_violin() + Fill_Continent +
  theme_light() +
  theme(panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    legend.position = "none",
    axis.text.x = element_text(angle = 40, hjust = 1))+
  labs(x = NULL,
       y = "Number of dim2 copies") + coord_flip()
cowplot::plot_grid(p1, p2, rel_widths = c(1, 1))

dim2 tree

# TODO
# Use beginning and end of dim2 gene (100bp) to get gene boundaries on assemblies.
# Update: I tried. But it really does not work better...
#Here is the code of the comparison in case I need to use it later still

#Selecting start and end coordinates which are found on a contig
# with at least one of the flanking genes.
dim2_start = dim2_blast_results_complete %>%
  filter(gene == "dim2_start") %>%
  full_join(., flank1) %>%
  full_join(., flank2) %>%
  mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
         dim2_flank2 = replace_na(dim2_flank2, "Not_found")) %>%
  mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
                                 "Both",
                                 ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
                                 "None",
                                 ifelse(sseqid == dim2_flank1, "Flank1",
                                 ifelse(sseqid == dim2_flank2, "Flank2", "What")))))%>%
  mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0", ">1"))%>%
  filter(Nb_flanking_found_2cat == ">1")

dim2_end = dim2_blast_results_complete %>%
  filter(gene == "dim2_end") %>%
  full_join(., flank1) %>%
  full_join(., flank2) %>%
  mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
         dim2_flank2 = replace_na(dim2_flank2, "Not_found")) %>%
  mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
                                 "Both",
                                 ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
                                 "None",
                                 ifelse(sseqid == dim2_flank1, "Flank1",
                                 ifelse(sseqid == dim2_flank2, "Flank2", "What")))))%>%
  mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0", ">1"))%>%
  filter(Nb_flanking_found_2cat == ">1")


# Table from start/end
#full_join(dim2_start, dim2_end, by = "sample") %>%
#  filter(complete.cases(.)) %>%
#  select(sample, dim2_flank1.x, dim2_flank2.x,
#         gene.x, sseqid.x, sstart.x, send.x, is_same_contig.x, Nb_flanking_found_2cat.x,
#         gene.y, sseqid.y, sstart.y, send.y, is_same_contig.y, Nb_flanking_found_2cat.y) %>%
#  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
#  group_by(is_same_contig.x, Continent) %>%
#  dplyr::summarise(n()) %>%
#  pivot_wider(names_from = Continent, values_from = `n()`)

# Table from the "simple" blast as comparison
#dim2_blast_results %>%
#  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
#  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
#  filter(length > 2500) %>%
#  dplyr::count(is_same_contig, Continent) %>%
#  pivot_wider(names_from = Continent, values_from = n)

#Keep only copies with:
#   - both flanking genes
#   - one flanking gene but a length of 2500 at least

dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
  filter(length > 2500) %>%
  dplyr::count(is_same_contig, Continent) %>%  
  dplyr::group_by(Continent) %>%
  dplyr::mutate(Somme_per_collection = sum(n)) %>%
  mutate(prop = n*100/Somme_per_collection) %>%
  ggplot(aes(x = is_same_contig, y = Continent, fill = prop)) +
    geom_tile() +
    theme_bw() +
    scale_fill_viridis_c() +
  labs (title = str_wrap(paste0("Proportion of long dim2 matches ",
                                "with at least one flanking gene"),
                         width = 70))

#Write the bed file to extract the sequences
dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
  dplyr::filter(length > 2500) %>%
  dplyr::mutate(start = ifelse(sstart > send, send, sstart),
                end = ifelse(sstart > send, sstart, send)) %>%
  dplyr::select(sample, sseqid, start, end) %>%
  write_delim(paste0(TE_RIP_dir, "Coordinates_dim2_all_samples.bed"),
            col_names = F)


#This command has to be run on the cluster
#while read sample chr start end ; do
# echo -e "${chr}\t${start}\t${end}" > temp.bed ;
# ~/Software/bedtools getfasta \
#    -fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta \
#    -bed temp.bed | sed "s/>/>${sample}:/" >> \
#    /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Native_dim2_all_samples.fasta ;
#done < /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Coordinates_dim2_all_samples.bed

#Run through the website because laziness
# mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output

For the following tree, I used the gene alignment from mafft (online) and created a neighbor-joining tree. The gene sequence from Z. passerinii was used as the outgroup sequence. Then, I attempt to represent both the geographical origin or the samples and the RIP level.

#Prep data to add to tree
temp2 = dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
  dplyr::filter(length > 2500) %>%
  dplyr::mutate(start = ifelse(sstart > send, send, sstart),
                end = ifelse(sstart > send, sstart, send)) %>%
  unite(coord, start, end, sep = "-") %>%
  dplyr::mutate(no_dot = stringr::str_replace(sseqid, "\\.", "_"))%>%
  unite(contig2, sample, no_dot, coord, sep = "_", remove = F)


#Read tree in
#details from the mafft website about the tree
#Size = 237 sequences × 1214 sites
#Method = Neighbor-Joining
#Model = Jukes-Cantor
#Bootstrap resampling = 100

tree = as_tibble(treeio::read.tree("~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk")) %>%
  mutate(label_copy = label) %>%
  separate(label_copy, into = c("nb", "contig"), extra = "merge", remove =F) %>%
  dplyr::mutate(nb = as.integer(nb)) %>%
  dplyr::mutate(label_OG = label) %>%
  dplyr::mutate(contig2 = stringr::str_replace(contig, "R_", "")) %>%
  full_join(., temp2, by = c("contig2")) %>%
  dplyr::mutate(label_temp = ifelse(is.na(sample), ifelse(is.na(contig), label, contig), sample)) %>%
  unite(col = "label_new", nb, label_temp, sep = "_")

tree2 = as_tibble(treeio::read.tree("~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk"))

temp = tree %>%
  dplyr::select(label, node, label_new, Continent, Composite_median, Collection) %>%
  mutate(Composite_median = ifelse(is.na(Composite_median), 0, Composite_median)) %>%
  filter(!is.na(label)) %>%
  mutate(RIP = ifelse(Composite_median >= 1.5, "High", ifelse(Composite_median <= 1, "Low", "Medium")))

#Find the outgroup!
root_node = tree2 %>%
  filter(grepl("Zpa63", label)) %>%
  dplyr::select(node) %>%
  pull()


rooted.tree <- ape::root(as.treedata(tree2), root_node)
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select (-label))

p2 <- p + geom_tippoint(aes(color = Continent)) +
  theme(legend.position = "right") +
  Color_Continent

p3 <- p + geom_tippoint(aes(color = RIP), alpha = 0.5) +
  theme(legend.position = "right") +
  scale_color_manual(values = c("darkred", "yellow", "orange"))

p4 <- p + geom_tippoint(aes(color = Collection), alpha = 0.5) +
  theme(legend.position = "right")

cowplot::plot_grid(p2, p3)

#df = temp %>% dplyr::select(RIP)
#rownames(df) <- temp$label
#gheatmap(p2, df, width=.2) +
#  scale_fill_manual(values = c("darkred", "yellow", "orange"), name = "RIP") +
#  labs(title = "Gene tree for the dim2 sequence",
#       y = "")

Based on these trees, there is some clustering according to geography. Additionally, it looks like the Middle-Eastern samples have either a high or low RIP level on their TE, whereas the level is rather in the middle of the range elsewhere…

Gene duplication and RIP

PacBio samples from the pangenome

If there is a relaxation of RIP, we could expect that TEs would not be the only things impacted but that gene duplications would also be possible more in RIP-relaxed genomes than in RIP-active.

Badet_pan_table = read_tsv(paste0(TE_RIP_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))

Badet_pan_table %>%
  dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
  group_by(isolate) %>%
  dplyr::mutate(nb_genes = n()) %>%
  group_by(isolate, N_genes) %>%
  dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
  dplyr::mutate(Percent = 100 * count / nb_genes) %>%
  dplyr::filter(N_genes >= 2) %>%
  dplyr::filter(N_genes <= 10) %>%
  ggplot(aes(isolate, Percent, fill = N_genes)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  scale_fill_gradient(low = "#EAEAEA", high = "#294D4A", na.value = NA)

Badet_pan_table %>%
  dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
  group_by(isolate) %>%
  dplyr::mutate(nb_genes = n()) %>%
  dplyr::filter(N_genes >= 2) %>%
  dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
  group_by(isolate, N_genes_cat) %>%
  dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
  dplyr::mutate(Percent = 100 * count / nb_genes) %>%
  ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))

Badet_pan_table %>%
  dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
  group_by(isolate) %>%
  dplyr::mutate(nb_genes = n()) %>%
  dplyr::filter(N_genes >= 2) %>%
  dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
  group_by(isolate, N_genes_cat) %>%
  dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
  dplyr::mutate(Percent = 100 * count / nb_genes) %>%
  dplyr::filter(N_genes_cat == 2) %>%
  ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))

On Illumina samples

core_duplicated_genes = read_tsv(paste0(depth_per_gene_dir, "Sabina_gcnv_event.filtered.core_chr.per_sample.tsv"))

temp = inner_join(sum_dim2_blast, core_duplicated_genes, by = c("sample" = "Sample")) %>%
  filter(M < 150)


p1 = ggplot(temp, aes(as.numeric(Composite_median),
                      M,
                      color = Continent,
                      shape = Collection))+
  geom_point() + Color_Continent +
  theme_bw()  +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Number of duplicated genes",
                      width = 20))

p2 = ggplot(temp, aes(pident_wm,
                      M,
                      color = Continent,
                      shape = Collection))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("Identity of the native match"),
                      width = 20),
         y = str_wrap("Number of duplicated genes",
                      width = 20))

legend_b <- get_legend(p1 + theme(legend.position = "right"))

top = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, ncol = 1)
cowplot::plot_grid(top, legend_b, nrow = 1)

ggplot(temp, aes(as.numeric(Composite_median),
                      M,
                      color = Continent,
                      shape = Collection))+
  geom_point() + Color_Continent +
  theme_bw()  +
    labs(x = str_wrap(paste("RIP composite index", " (median per isolate)"),
                      width = 20),
         y = str_wrap("Number of duplicated genes", width = 20)) +
  facet_wrap(vars(Continent), scales = "free")

temp = inner_join(core_duplicated_genes, TE_qty, by = c("Sample" = "ID_file")) %>%
  filter(!is.na(Cluster))

filter(temp, M < 100) %>%
  ggplot(aes(M, Total_insertions, col = Continent)) + 
  geom_point()+ 
  Color_Continent +
  theme_bw() 

filter(temp, M < 100) %>%
  ggplot(aes(M, Percent_TE_Reads, col = Continent)) + 
  geom_point()+ 
  Color_Continent +
  theme_bw() 

p1 = filter(temp, M < 100) %>%
  ggplot(aes(Cluster, M, fill = Cluster)) + 
  geom_violin()+ 
  geom_boxplot(col = "white", width = .2) +
  Fill_Cluster+
  theme_bw() 
p2 = ggplot(temp, aes(Cluster, D, fill = Cluster)) + 
  geom_violin()+ 
  geom_boxplot(col = "white", width = .2) +
  Fill_Cluster+
  theme_bw() 

cowplot::plot_grid(p1 + coord_flip() + theme(legend.position = "none"),
                   p2 + coord_flip() + theme(legend.position = "none"))

GWAS

#Number of accessory chromsomes
Lthres = 0.25
Hthres = 1.75
# Reading in the summarized data
core_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_sample_core_chr_summary.q30.txt")) %>%
  mutate(Median_core = Median) 

chrom_depth = read_tsv(paste0(depth_per_window_dir, "Depth_per_chromosome_summary.q30.txt")) %>%
  left_join(., core_depth %>% dplyr::select(Sample, Median_core)) %>%
  mutate(Relative_depth = as.numeric(Median)/(0.001 + as.numeric(Median_core))) %>%
  filter(CHROM != "mt") %>%
  dplyr::mutate(Depth_is = ifelse(Relative_depth > Hthres, "High",ifelse(Relative_depth < Lthres, "Low", "Normal"))) %>%
  dplyr::mutate(PAV = ifelse(as.numeric(Relative_depth) > 0.5, "P", "D"))

temp = chrom_depth %>%
  filter(as.numeric(CHROM) > 13) %>%
  dplyr::count(Sample, PAV) %>%
  pivot_wider(names_from = PAV, values_from = n, values_fill = 0) %>%
  dplyr::select(ID_file = Sample, Acc_chr = P)

#Number of duplicated genes
duplicated_loci = core_duplicated_genes %>%
  dplyr::select(ID_file = Sample, Duplicated_genes = M) %>% 
  full_join(., temp)


#Create standardized values for the environment variables
stand = left_join(TE_RIP, duplicated_loci) %>% 
  dplyr::select(ID_file, Percent_TE_Reads, Composite_median, Composite_mean, Acc_chr, Duplicated_genes, Non_ref_insertions) %>%
  pivot_longer(cols = -c(ID_file), names_to = "Variable", values_to = "Value") %>%
  group_by(Variable) %>%
  dplyr::mutate(Standard_value = scale(Value)) %>%
  dplyr::select(-Value) %>%
  pivot_wider(names_from = Variable, values_from = Standard_value, values_fn = length)


#The fam file should be same for all core chromosomes, hopefully. So I only have to create one to get the file format including the phenotypes. This can then be used for all chromosomes.
temp2 = left_join(read_tsv(Zt_list, col_names = "ID_file") %>% mutate(ID2 = ID_file), 
                            Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
  bind_cols(., tibble(X1 = rep(0, nrow(Zt_meta)), X2 = rep(0, nrow(Zt_meta)), X3 = rep(0, nrow(Zt_meta)))) %>%
  inner_join(., stand) 

dplyr::select(temp2, -Latitude, -Longitude) %>%
 write_delim(file = paste0(TE_RIP_dir, "Phenotypes_for_GWAS.fam"), delim = " ", col_names = F)

#Adding PC info as covariates
left_join(read_tsv(Zt_list, col_names = "ID_file"), read_tsv(paste0(PopStr_dir, vcf_name, ".PCA_results.tab"))) %>%
  mutate(Intercept = 1) %>%
  dplyr::select(Intercept, PC1, PC2, PC3, PC4) %>%
  write_delim(., paste0(TE_RIP_dir, "PC_for_correction.cov"), delim = " ", col_names = F)


#Transfer on the cluster
#rsync -avP ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/Phenotypes_for_GWAS.fam alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/Phenotypes_for_GWAS.fam
#rsync -avP ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/PC_for_correction.cov alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/PC_for_correction.cov
  
#To run gemma on the cluster: conda activate env0
#Commands are then in the format gemma -h 

 
# GWAS
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/output/Results_genomics_characteristics_* ~/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/2_GWAS/
phenotypes = c("TE", "RIP_med", "RIP_mean", "Accessory_chr", "Duplicated_genes", "TE_insertions")

results_GWAS = list()
for (i in c(1:length(phenotypes))) {
  temp = list.files(path = paste0(TE_RIP_dir, "2_GWAS/"), 
                    pattern = paste0("*phenotype_", i,"..subset_ALL.assoc.txt")) %>% 
    map_df(~read_tsv(file = paste0(paste0(TE_RIP_dir, "2_GWAS/"), .), 
                     col_types = c("dcddccdddddd"))) %>%
    unite(col = SNP, chr, ps, sep = "_", remove = F) %>%
    dplyr::select(SNP, CHR = chr, BP = ps, P = p_wald, zscore = logl_H1, beta) %>%
    mutate(Phenotype = phenotypes[i], Nb = i)
  results_GWAS[[phenotypes[i]]] = temp
}

results_GWAS = bind_rows(results_GWAS)

#left_join(TE_RIP, duplicated_loci) %>% 
#  dplyr::select(ID_file, Percent_TE_Reads, Composite_median, Composite_mean, Acc_chr, Duplicated_genes) %>%
#  pivot_longer(cols = -c(ID_file), names_to = "Variable", values_to = "Value")
#Genomic Inflation Factor
kable(results_GWAS %>% 
        group_by(Phenotype) %>%
        summarize(Genomic_Inflation_Factor = median(qchisq(1-P,1))/qchisq(0.5,1)) %>%
        arrange(Genomic_Inflation_Factor))
Phenotype Genomic_Inflation_Factor
Duplicated_genes 0.8815758
RIP_med 0.8981546
TE_insertions 0.9754432
RIP_mean 1.0226379
Accessory_chr 1.0530365
TE 1.0728435
#QQ-plots
par(mfrow=c(3,1)) 
for (i in c(1:length(phenotypes))){
temp = results_GWAS %>% filter(Phenotype == phenotypes[i]) %>% dplyr::sample_frac(size = .10) %>% dplyr::select(P) %>% pull()
GWASTools::qqPlot(temp, thinThreshold = 2)
}

head(results_GWAS) 
## # A tibble: 6 x 8
##   SNP        CHR     BP     P zscore     beta Phenotype    Nb
##   <chr>    <dbl>  <dbl> <dbl>  <dbl>    <dbl> <chr>     <int>
## 1 1_111115     1 111115 0.899 -1068.  0.0131  TE            1
## 2 1_111124     1 111124 0.228 -1067. -0.124   TE            1
## 3 1_111127     1 111127 0.232 -1067. -0.123   TE            1
## 4 1_111283     1 111283 0.831 -1068. -0.0113  TE            1
## 5 1_111350     1 111350 0.987 -1068. -0.00201 TE            1
## 6 1_111358     1 111358 0.944 -1068.  0.00935 TE            1
Bonferroni_threshold = results_GWAS %>% dplyr::count(Nb) %>%
  mutate(Bonferroni_threshold = 0.05/n) %>% summarize(average = mean(Bonferroni_threshold)) %>% pull()

kable(results_GWAS %>% 
  filter(P <= Bonferroni_threshold) %>% 
  dplyr::count(Phenotype) %>% 
    arrange(n))
Phenotype n
Accessory_chr 10
TE 99
RIP_med 145
TE_insertions 193
RIP_mean 286
Duplicated_genes 786
results_GWAS = results_GWAS %>% mutate(SNP_type = ifelse(P <= Bonferroni_threshold, "significant", "NS"))

significant_TE = results_GWAS %>%
  filter(P <= Bonferroni_threshold)  %>% group_by(SNP, CHR, BP) %>%
  dplyr::mutate(Count = n()) %>% mutate(SNP_type = "significant")%>% 
  ungroup() 

significant_TE %>%
  dplyr::select(CHR, BP) %>% 
  distinct() %>%
  write_delim(paste0(TE_RIP_dir, "2_GWAS/Significant_TE.tsv"), delim = "\t", col_names = F)

Identifying the variants which are significant is very useful already, but it is hard to work with so many “independent” positions. It also does not make sense conceptually, as nearby variants are probably picking up the exact same signal through linkage. So, here I gather variants together into “significant loci” if they are no bigger gaps than a certain threshold.

#Distance between two significant SNPs to include them in the same region
len_threshold = 20000L

#For each phenotype and for each chromosome

list_loci = list()
for (j in c(1:length(phenotypes))) {
  temp2 = list()
  for (i in c(1:13)){
    #Pull vector of position for the right chromosome and phenotype
    v = filter(significant_TE, CHR == i) %>% 
      filter(Phenotype == phenotypes[j]) %>%
      dplyr::select(CHR, BP) %>% 
      distinct() %>% 
      pull(BP) %>% sort()
    
    # Split the vectors into group based on the length threshold
    temp = split(v, cumsum(c(TRUE, diff(v) >= len_threshold)))
    #print(paste(c(phenotypes[j], i, length(temp)), sep = " "))
    
    temp2[[i]] = temp %>% map_df(enframe, .id = 'Locus_nb') %>% 
      dplyr::rename(BP = value)%>%
      mutate(CHR = i)
  }
list_loci[[j]] = bind_rows(temp2)
}

#Adding loci info to the table
significant_TE = bind_rows(list_loci) %>%
  full_join(significant_TE, .)

significant_TE %>% dplyr::select(CHR, Locus_nb, Phenotype) %>%
  distinct() %>% 
  dplyr::count(CHR, Phenotype)%>%
  pivot_wider(names_from = Phenotype, values_from = n, values_fill = 0)
## # A tibble: 13 x 7
##      CHR Duplicated_genes RIP_med RIP_mean    TE TE_insertions Accessory_chr
##    <dbl>            <int>   <int>    <int> <int>         <int>         <int>
##  1     1               25       4        0     0             0             0
##  2     2               22       4        3     4             3             0
##  3     3               26       5        0     2             2             0
##  4     4               24       0        0     0             2             1
##  5     5               23       4        2     4             4             0
##  6     6               16       5        3     0             0             0
##  7     7               17       9        7     6             7             0
##  8     8               15       3        0     0             2             0
##  9     9                8       1        0     0             0             0
## 10    10               16       1        0     2             0             0
## 11    11               11       0        0     0             0             0
## 12    12               11       2        0     2             2             0
## 13    13                6       3        3     0             0             0
#rsync -avP ../4_TE_RIP/2_GWAS/Significant_TE.tsv alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/

#../Software/vcftools_jydu/src/cpp/vcftools --gzvcf /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.ann.vcf.gz --positions /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.tsv --out /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE --recode --recode-INFO-all

#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/ANN\n' /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.recode.vcf > /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.tsv

#grep "#CHROM" /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.recode.vcf | cut -f 1,2,4,5,10- | sed 's/#//' > /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.tab
#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.recode.vcf >> /data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.tab

#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/4_TE_RIP/2_GWAS/Significant_TE.ann.t* ../4_TE_RIP/2_GWAS/

For each region, I would like to be able to get more information about the variants: what is the distribution of the alleles compared to the phenotype with which this position was significantly associated? What is the minor allele frequency at this locus? Since it is impossible to represent all thousands of associated variants, I chose one position in particular for each significant locus identified previously.

#i=2
min_length_locus = 1000

whatever <- function(min_length_locus, pheno, expected_output){
  temp = significant_TE %>% 
    filter(Phenotype == pheno) %>% 
    group_by(CHR, Locus_nb) %>%
    mutate(Locus_length = 1 + max(BP) - min(BP)) %>%
    filter(Locus_length >= min_length_locus) %>%
    dplyr::mutate(Rank = rank(P)) %>%
    arrange(Rank) %>%
    ungroup() %>%
    filter(Rank == 1) %>%
    dplyr::select(SNP, CHR, BP)

  relevant_pheno = TE_RIP %>% 
    dplyr::select(ID_file, TE = Percent_TE_Reads, RIP_med = Composite_median, RIP_mean = Composite_mean) %>%
    pivot_longer(cols = -c(ID_file), names_to = "Phenotype", values_to = "Value") %>%
    filter(Phenotype == pheno)

  
  temp = read_delim(paste0(TE_RIP_dir, "2_GWAS/Significant_TE.ann.tab"), na = c("."), delim = "\t") %>%
    inner_join(temp, by = c("CHROM" = "CHR", "POS" = "BP")) %>%
    pivot_longer(-c(CHROM, POS, REF, ALT, SNP), 
                 names_to = "ID_file", values_to = "Allele") %>%
    inner_join(., relevant_pheno) %>%
    filter(Allele != "NA")

  
  if (expected_output == "plot"){
  temp  %>%
    ggplot(aes(Value, fill = as.factor(Allele), col = as.factor(Allele))) +
    geom_density(alpha = .4) + 
    facet_wrap(vars(SNP), scales = "free") +
    theme_bw() +
    labs(title = paste0("Top associated SNPs for each locus and phenotype: ", pheno))}
  else{
  temp %>% dplyr::count(SNP, Allele) %>%
    pivot_wider(names_from = Allele, values_from = n)
  }
}

whatever(min_length_locus, phenotypes[1], "plot")

whatever(min_length_locus, phenotypes[1], "table")
## # A tibble: 3 x 3
##   SNP          `0`   `1`
##   <chr>      <int> <int>
## 1 12_1201012   801    25
## 2 3_3213337   1178   518
## 3 5_89642      808    43
whatever(min_length_locus, phenotypes[2], "plot")

whatever(min_length_locus, phenotypes[2], "table")
## # A tibble: 8 x 3
##   SNP         `0`   `1`
##   <chr>     <int> <int>
## 1 1_2077079   748   102
## 2 10_704757   823    23
## 3 6_1367971   824    27
## 4 6_820378    504  1196
## 5 7_1183768   736    96
## 6 7_1493135   725   125
## 7 7_558777    791    60
## 8 8_2018754   834    17
#Find genes and corresponding effects
temp = read_tsv(paste0(TE_RIP_dir, "2_GWAS/Significant_TE.ann.tsv"), col_names = c("CHR", "BP", "REF", "ALT", "ANN")) %>%
  separate(ANN, sep = ",", 
           into = c("ANN1", "ANN2", "ANN3", "ANN4", "ANN5", "ANN6", "ANN7", "ANN8", "ANN9", "ANN10", "ANN11",
                    "ANN12", "ANN13", "ANN14", "ANN15", "ANN16", "ANN17"), 
           extra = "warn")

signif_ann_TE = temp %>%
  pivot_longer(cols = c(-CHR, -BP, -REF, -ALT), names_to = "ANN", values_to = "Annotation") %>%
  filter(!is.na(Annotation)) %>%
  separate(Annotation, into = c("ALT2", "Variant_type", "Effect_strength", "Gene", "Rest"), sep = "\\|", extra = "merge") %>%
  filter(!str_detect(Gene, "-")) %>%
  separate(Gene, into = c("temp", "Chrom", "Gene_nb"), remove = FALSE)  %>%
  dplyr::select(-ANN, -ALT2, -temp) %>%
  mutate(Gene_nb = as.numeric(Gene_nb)) %>%
  left_join(., read_tsv(effectors_annotation_file) %>% 
              filter(Sample == "Zt09") %>% 
              mutate(Gene = str_replace(Protein_ID, "_chr", "")))
# Compute chromosome size and cumulative position of each chromosome
results_GWAS_for_plot <- results_GWAS %>% 
  group_by(CHR) %>% 
  dplyr::summarise(chr_len=max(BP)) %>% 
  mutate(tot=cumsum(chr_len)-chr_len) %>%
  dplyr::select(-chr_len) 

#EStimate middle of chromosome to center the labels
axisdf = results_GWAS_for_plot %>%
  inner_join( ., results_GWAS %>% filter(Nb == 1)) %>%
  arrange(CHR, BP) %>%
  mutate(BPcum = BP+tot) %>% 
  group_by(CHR) %>% 
  dplyr::summarise(center=( max(BPcum) + min(BPcum) ) / 2 )


# Manhattan plot function
Manhattan_one_pheno <- function(pheno, color_duo) {
  
  temp = results_GWAS %>% filter(Phenotype == pheno) %>%
    inner_join(., results_GWAS_for_plot)  %>%
    left_join(., significant_TE %>% dplyr::filter(Phenotype == pheno) ) %>%
    mutate(SNP_type = ifelse(is.na(SNP_type), "Other", SNP_type)) %>%
    mutate(logP = -log10(P)) %>%
    filter(CHR <= 13) %>%
    filter(logP > 2) 
  
  ggplot(temp, aes(x=BP, y=logP)) +
    geom_point( aes(color=SNP_type), alpha = 0.8, size=1.3) +
    scale_color_manual(values = color_duo) +
    geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
               linetype = "dashed", color = "grey20") +
    theme_bw() +
    theme( legend.position="none", panel.border = element_blank(),
      panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(),
      axis.title.x=element_blank(), axis.text.x=element_blank(),
      axis.ticks.x=element_blank(),
      strip.background = element_rect(colour="white", fill="white"),
      plot.margin = unit(c(0, 0, 0, 0), "cm")) +
      facet_grid(cols = vars(CHR), scales = "free_x") 
}


#Plot from gff
plot_gff <- function(gff_name, chromosome, min_coord, max_coord, genes_to_highlight, col = "black") {
  gff = read_tsv(gff_name, 
               col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot"))  %>%
  separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
  mutate(ID = str_remove(ID, "ID=")) %>% 
  filter(CHR == chromosome) %>%
  filter(Start >= min_coord) %>%
  filter(Stop <= max_coord) %>%
  mutate(y_value = rank(Start)) %>%
  arrange(Start)

## If too many genes, organize them
if (nrow(gff) > 5) {
  temp = ceiling(nrow(gff)/5)
  gff = gff %>% mutate(y_value = rep(c(1:5), temp)[1:nrow(gff)])
}

## Make plot
c = gff %>%
  ggplot() +
  geom_segment(mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value), size = 2, col = col) +
  geom_text(aes(x = Stop + (max_coord - min_coord)*0.05 , y = y_value, label = ID), size = 2) +
  theme_bw() + 
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), 
        axis.title.x = element_blank(), axis.ticks.y = element_blank(), 
        panel.grid.minor.y = element_blank(), panel.grid.major.y = element_blank()) +
  coord_cartesian(xlim = c(min_coord, max_coord), ylim = c(min(gff$y_value) - 0.5, max(gff$y_value) + 0.5))

## Highlight interesting genes
temp = match(genes_to_highlight, gff$ID)
if ( sum(!is.na(temp)) > 0 ){
  c = c + geom_segment(data = filter(gff, ID == genes_to_highlight),
                   mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value), 
                   size = 3, col = "blue")
}

c
}
temp = tibble(CHR = c(5, 6), Gene = c("rid", "dnmt5"), Pos = c(255000, 2527000))

p = Manhattan_one_pheno("TE", c("#FFD399", "#ff9f1c")) +
  labs(y = str_wrap("TE content (-logP)", 25)) + 
  geom_vline(data = temp, aes(xintercept = Pos), alpha =.4, col = "grey20")

q = Manhattan_one_pheno("TE_insertions", c("#cbf3f0", "#2ec4b6")) +
  labs(y = str_wrap("TE insertions (-logP)", 25)) + 
  geom_vline(data = temp, aes(xintercept = Pos), alpha =.4, col = "grey20") +
      geom_text(data = temp, aes(label = Gene, x= Pos), 
                y = -5, # Set the position of the text to always be at '14.25'
                size = 4, angle = 90,
                vjust = 0, hjust = 0) +
      coord_cartesian(ylim = c(-3.8, 27), # This focuses the x-axis on the range of interest
                      clip = 'off')

cowplot::plot_grid(p, q, ncol = 1)

Manhattan_one_pheno("RIP_med", c("#cbf3f0", "#2ec4b6")) +
  labs(y = str_wrap("RIP (-logP)", 25)) + 
  geom_vline(data = temp, aes(xintercept = Pos), alpha =.4, col = "grey20") +
      geom_text(data = temp, aes(label = Gene, x= Pos), 
                y = -5, # Set the position of the text to always be at '14.25'
                size = 4, angle = 90,
                vjust = 0, hjust = 0) +
      coord_cartesian(ylim = c(-3.8, 27), # This focuses the x-axis on the range of interest
                      clip = 'off')

#Set parameters for the plot
#chromosome = 5
#min_coord = 220000
#max_coord = 270000

#chromosome = 13
#min_coord = 800048
#max_coord = 1129645

#chromosome = 3
#min_coord = 3190000
#max_coord = 3218000

chromosome = 5
min_coord = 220000
max_coord = 270000
genes_to_highlight = c("Zt09_model_5_00047")
TE_gff = "~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Cecile_IPO323_refTEs_match.gff"
gene_gff = "~/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3"

# Prepare the gene section of the plot
c = plot_gff(TE_gff, chromosome, min_coord, max_coord, genes_to_highlight, "grey50")
d = plot_gff(gene_gff, chromosome, min_coord, max_coord, genes_to_highlight)

# Prepare the GWAS section of the plot for the first phenotype
b = results_GWAS %>% filter(CHR == chromosome) %>%
  filter(Phenotype == "RIP_med") %>%
  filter(BP >= min_coord) %>%
  filter(BP <= max_coord) %>%
  mutate(logP = -log10(P)) %>%
  ggplot() +
  geom_point(aes(x = BP, y = logP, col = logP)) +
    geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
               linetype = "dashed", color = "grey20") +
  theme_bw() +
  labs(title = "GWAS based on RIP composite index median",
       subtitle = paste0("The region is on the chromosome ", chromosome, 
                           " from ", min_coord, "bp to ", max_coord, "bp.")) +
  theme(axis.title.x = element_blank()) + 
  coord_cartesian(xlim = c(min_coord, max_coord))

cowplot::plot_grid(b, c, d, ncol = 1, align = 'v', axis = 'lr', rel_heights = c(1, 0.3, 0.3))

# Prepare the second plot for the second phenotype 
b = results_GWAS %>% filter(CHR == chromosome) %>%
  filter(Phenotype == "TE") %>%
  filter(BP >= min_coord) %>%
  filter(BP <= max_coord) %>%
  mutate(logP = -log10(P)) %>%
  ggplot() +
  geom_point(aes(x = BP, y = logP, col = logP)) +
    geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
               linetype = "dashed", color = "grey20")+
  theme_bw() +
  labs(title = "GWAS based on proportion of TE reads",
       subtitle = paste0("The region is on the chromosome ", chromosome, 
                           " from ", min_coord, "bp to ", max_coord, "bp.")) +
  theme(axis.title.x = element_blank()) + 
  coord_cartesian(xlim = c(min_coord, max_coord))
cowplot::plot_grid(b,  d, c, ncol = 1, align = 'v', axis = 'lr', rel_heights = c(1, 0.3, 0.3))



p = Manhattan_one_pheno("Duplicated_genes", c("#FFD399", "#ff9f1c")) +
  labs(y = str_wrap("Duplicated genes(-logP)", 25)) 

q = Manhattan_one_pheno("Accessory_chr", c("#cbf3f0", "#2ec4b6")) +
  labs(y = str_wrap("Accessory chromosome number (-logP)", 25)) +
      coord_cartesian(ylim = c(-3.8, 27), # This focuses the x-axis on the range of interest
                      clip = 'off')

cowplot::plot_grid(p, q, ncol = 1)